Source code for disdrodb.l0.check_readers

import os
import glob
import shutil
import pandas as pd


from disdrodb.l0.l0_reader import get_station_reader
from disdrodb.api.io import _get_disdrodb_directory


# current file path
PACKAGE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
RAW_DIR = os.path.join(PACKAGE_DIR, "tests", "pytest_files", "check_readers", "DISDRODB")


[docs]def get_list_test_data_sources() -> list: """Get list of test data sources. Returns ------- list List of test data sources. """ list_of_data_sources = os.listdir(os.path.join(RAW_DIR, "Raw")) return list_of_data_sources
[docs]def get_list_test_campaigns(data_source: str) -> list: """Get list of test campaigns for a given data source. Parameters ---------- data_source : str Data source. Returns ------- list List of test campaigns. """ list_of_campaigns = os.listdir(os.path.join(RAW_DIR, "Raw", data_source)) return list_of_campaigns
[docs]def get_list_test_stations(data_source: str, campaign_name: str) -> list: """Get list of test stations for a given data source and campaign. Parameters ---------- data_source : str Data source. campaign_name : str Name of the campaign. Returns ------- list List of test stations. """ yml_files = glob.glob(os.path.join(RAW_DIR, "Raw", data_source, campaign_name, "metadata", "*.yml")) list_station_names = [os.path.splitext(os.path.basename(i))[0] for i in yml_files] return list_station_names
[docs]def is_parquet_files_identical(file1: str, file2: str) -> bool: """Check if two parquet files are identical. Parameters ---------- file1 : str Path to the first file. file2 : str Path to the second file. Returns ------- bool True if the two files are identical, False otherwise. """ df1 = pd.read_parquet(file1) df2 = pd.read_parquet(file2) return df1.equals(df2)
[docs]def run_reader_on_test_data(data_source: str, campaign_name: str) -> None: """Run reader over the data sample. Parameters ---------- data_source : str Data source. campaign_name : str Campaign name. """ for station_name in get_list_test_stations(data_source, campaign_name): reader = get_station_reader( disdrodb_dir=RAW_DIR, data_source=data_source, campaign_name=campaign_name, station_name=station_name, ) # Define raw_dir and process_dir raw_dir = _get_disdrodb_directory( disdrodb_dir=RAW_DIR, product_level="RAW", data_source=data_source, campaign_name=campaign_name, ) processed_dir = _get_disdrodb_directory( disdrodb_dir=RAW_DIR, product_level="L0A", data_source=data_source, campaign_name=campaign_name, check_exist=False, ) reader( raw_dir=raw_dir, processed_dir=processed_dir, station_name=station_name, force=True, verbose=False, debugging_mode=False, parallel=False, ) return processed_dir
[docs]def check_all_readers() -> None: """Test all readers that have data samples and ground truth. Raises ------ Exception If the reader validation has failed. """ for data_source in get_list_test_data_sources(): for campaign_name in get_list_test_campaigns(data_source): process_dir = run_reader_on_test_data(data_source, campaign_name) ground_truth = glob.glob( os.path.join(RAW_DIR, "Raw", data_source, campaign_name, "ground_truth", "*", "*.parquet") ) processed_file = glob.glob(os.path.join(process_dir, "L0A", "*", "*.parquet")) for i, ground_truth_fpath in enumerate(ground_truth): station_name = os.path.basename(os.path.dirname(ground_truth[i])) processed_file_fpath = processed_file[i] is_correct = is_parquet_files_identical(ground_truth_fpath, processed_file_fpath) if not is_correct: raise Exception( f"Reader validation has failed for data_source '{data_source}', campaign_name '{campaign_name}'" f" and station_name '{station_name}'" ) # Remove Processed directory if exists if os.path.exists(os.path.join(RAW_DIR, "Processed")): try: shutil.rmtree(os.path.join(RAW_DIR, "Processed")) except: pass