Source code for disdrodb.l0.l0_reader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# -----------------------------------------------------------------------------.
# Copyright (c) 2021-2022 DISDRODB developers
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.

import os
import logging

logger = logging.getLogger(__name__)


####--------------------------------------------------------------------------.


def _get_readers_directory() -> str:
    """Returns the path to the disdrodb.l0.readers directory within the disdrodb package."""
    # Current file path
    l0_folder_path = os.path.dirname(__file__)

    # Readers folder path
    reader_folder_path = os.path.join(l0_folder_path, "readers")
    return reader_folder_path


def _get_readers_data_sources() -> list:
    """Returns the readers data sources available at disdrodb.l0.readers"""
    # Readers folder path
    reader_folder_path = _get_readers_directory()

    # List of readers folder
    list_data_sources = [os.path.basename(f.path) for f in os.scandir(reader_folder_path) if f.is_dir()]
    # Directory to remove
    bad_dirs = ["__pycache__", ".ipynb_checkpoints"]
    list_data_sources = [name for name in list_data_sources if name not in bad_dirs]

    return list_data_sources


def _get_readers_data_sources_path() -> list:
    """Returns the list of readers data sources directory paths within disdrodb.l0.readers"""
    # Readers folder path
    reader_folder_path = _get_readers_directory()

    # List of readers folder
    list_data_sources = [f.path for f in os.scandir(reader_folder_path) if f.is_dir()]
    return list_data_sources


def _get_readers_paths_by_data_source(data_source):
    """Return the filepath list of available readers for a specific data source.

    This function does not check the data_source validity.
    """
    # Retrieve reader data source directory
    reader_folder_path = _get_readers_directory()
    reader_data_source_path = os.path.join(reader_folder_path, data_source)
    if not os.path.isdir(reader_data_source_path):
        raise ValueError(f"No {data_source} directory in disdrodb.l0.readers")
    # Retrieve list of available readers paths
    list_readers_paths = [f.path for f in os.scandir(reader_data_source_path) if f.is_file() and f.path.endswith(".py")]
    return list_readers_paths


def _get_readers_names_by_data_source(data_source):
    """Return the reader available for a given data_source.

    This function does not check the data_source validity.
    """
    # Retrieve reader data source directory
    list_readers_paths = _get_readers_paths_by_data_source(data_source)
    list_readers_names = [os.path.basename(path).replace(".py", "") for path in list_readers_paths]
    return list_readers_names


####--------------------------------------------------------------------------.


def _check_reader_data_source(reader_data_source: str) -> str:
    """Check if the provided data source exists within the available readers.

    Please run get_available_readers_dict() to get the list of all available reader.

    Parameters
    ----------
    reader_data_source : str
        The directory within which the reader_name is located in the
        disdrodb.l0.readers directory.

    Returns
    -------
    str
        If data source is valid, return the correct data source name

    Raises
    ------
    ValueError
        Error if the data source name provided is not a directory within the disdrodb.l0.readers directory.
    """

    # List available readers data sources
    available_reader_data_sources = _get_readers_data_sources()
    # If not valid data_source, raise error
    if reader_data_source not in available_reader_data_sources:
        msg = f"Reader data source {reader_data_source} is not a directory inside the disdrodb.l0.readers directory."
        logger.error(msg)
        raise ValueError(msg)
    return reader_data_source


[docs]def check_reader_exists(reader_data_source: str, reader_name: str) -> str:
    """Check if the provided data source exists and reader names exists within the available readers.

    Please run get_available_readers_dict() to get the list of all available reader.

    Parameters
    ----------
    reader_data_source : str
        The directory within which the reader_name is located in the
        disdrodb.l0.readers directory.
    reader_name : str
        Campaign name

    Returns
    -------
    str
        If True : returns the reader name
        If False : Error - return None

    Raises
    ------
    ValueError
        Error if the reader name provided for the campaign has not been found.
    """
    # Check valid data_source
    reader_data_source = _check_reader_data_source(reader_data_source)
    # Get available reader names
    list_readers_names = _get_readers_names_by_data_source(reader_data_source)
    # If not valid reader_name, raise error
    if reader_name not in list_readers_names:
        msg = f"Reader {reader_name} is not valid. Valid readers {list_readers_names}."
        logger.exception(msg)
        raise ValueError(msg)
    return reader_name


[docs]def get_available_readers_dict() -> dict:
    """Returns the readers description included into the current release of DISDRODB.

    Returns
    -------
    dict
        The dictionary has the following schema {"data_source": {"reader_name": "reader_file_path"}}
    """
    # Format:
    # {data_source: {reader_name: reader_path,
    #                reader_name1: reader_path1}
    # }
    # Get list of reader data sources
    list_reader_data_sources = _get_readers_data_sources()

    # Build dictionary
    dict_reader = {}
    for data_source in list_reader_data_sources:
        # Retrieve the filepath of the available readers
        list_readers_paths = _get_readers_paths_by_data_source(data_source)
        # Initialize the data_source dictionary
        dict_reader[data_source] = {}
        for reader_path in list_readers_paths:
            reader_name = os.path.basename(reader_path).replace(".py", "")
            dict_reader[data_source][reader_name] = reader_path
    # Return available dictionary
    return dict_reader


[docs]def available_readers(data_sources=None, reader_path=False):
    """Retrieve available readers information."""
    # Get available readers dictionary
    dict_readers = get_available_readers_dict()
    # If data sources is not None, subset the dictionary
    if data_sources is not None:
        # Check valid data sources
        if isinstance(data_sources, str):
            data_sources = [data_sources]
        data_sources = [_check_reader_data_source(data_source) for data_source in data_sources]
        # Create new dictionary
        dict_readers = {data_source: dict_readers[data_source] for data_source in data_sources}
    # If reader_path=False, provide {data_source: [list_reader_names]}
    if not reader_path:
        dict_readers = {data_source: list(dict_readers.keys()) for data_source, dict_readers in dict_readers.items()}
    return dict_readers


####--------------------------------------------------------------------------.


[docs]def get_reader(reader_data_source: str, reader_name: str) -> object:
    """Returns the reader function based on input parameters.

    Parameters
    ----------
    reader_data_source : str
        The directory within which the reader_name is located in the
        disdrodb.l0.readers directory.
    reader_name : str
        The reader name.

    Returns
    -------
    object
        The reader() function

    """
    # Check data source and reader_name validity
    reader_data_source = _check_reader_data_source(reader_data_source)
    reader_name = check_reader_exists(reader_data_source=reader_data_source, reader_name=reader_name)
    # Retrive reader function
    if reader_name:
        full_name = f"disdrodb.l0.readers.{reader_data_source}.{reader_name}.reader"
        module_name, unit_name = full_name.rsplit(".", 1)
        my_reader = getattr(__import__(module_name, fromlist=[""]), unit_name)

    return my_reader


####--------------------------------------------------------------------------.
#### Checks for reader args


def _get_expected_reader_arguments():
    """Return a list with the expected reader arguments."""
    expected_arguments = [
        "raw_dir",
        "processed_dir",
        "station_name",
        "force",
        "verbose",
        "parallel",
        "debugging_mode",
    ]
    return expected_arguments


[docs]def check_reader_arguments(reader):
    """Check the reader have the expected input arguments."""
    import inspect

    signature = inspect.signature(reader)
    reader_arguments = sorted(list(signature.parameters.keys()))
    expected_arguments = sorted(_get_expected_reader_arguments())
    if reader_arguments != expected_arguments:
        raise ValueError(f"The reader must be defined with the following arguments: {expected_arguments}")
    return None


####--------------------------------------------------------------------------.
#### Checks for metadata reader key


def _check_metadata_reader(metadata):
    """Check reader key is available and there is the associated reader."""
    # Check the reader is specified
    if "reader" not in metadata:
        raise ValueError("The reader is not specified in the metadata.")
    # If the reader name is specified, test it is valid.
    # - Convention: reader: "<data_source>/<reader_name>" in disdrodb.l0.readers
    reader_reference = metadata.get("reader")
    # - Check it contains /
    if "/" not in reader_reference:
        raise ValueError(
            f"The reader '{reader_reference}' reported in the metadata is not valid. Must have"
            " '<data_source>/<reader_name>' pattern."
        )
    # - Get the reader_reference component list
    reader_components = reader_reference.split("/")
    # - Check composed by two elements
    if len(reader_components) != 2:
        raise ValueError("Expecting the reader reference to be composed of <data_source>/<reader_name>.")
    # - Retrieve reader data source and reader name
    reader_data_source = reader_components[0]
    reader_name = reader_components[1]

    # - Check the reader is available
    check_reader_exists(reader_data_source=reader_data_source, reader_name=reader_name)

    return None


[docs]def get_reader_from_metadata_reader_key(reader_data_source_name):
    """Retrieve the reader from the `reader` metadata value.

    The convention for metadata reader key: <data_source/reader_name> in disdrodb.l0.readers
    """
    reader_data_source = reader_data_source_name.split("/")[0]
    reader_name = reader_data_source_name.split("/")[1]
    reader = get_reader(reader_data_source=reader_data_source, reader_name=reader_name)
    return reader


def _get_reader_from_metadata(metadata):
    """Retrieve the reader from the metadata key `reader`

    The convention for metadata reader key: <data_source/reader_name> in disdrodb.l0.readers
    """
    reader_data_source_name = metadata.get("reader")
    return get_reader_from_metadata_reader_key(reader_data_source_name)


[docs]def get_station_reader(disdrodb_dir, data_source, campaign_name, station_name):
    """Retrieve reader form station metadata information."""
    from disdrodb.api.io import get_metadata_dict

    # Get metadata
    metadata = get_metadata_dict(
        disdrodb_dir=disdrodb_dir,
        product_level="RAW",
        data_source=data_source,
        campaign_name=campaign_name,
        station_name=station_name,
    )
    # ------------------------------------------------------------------------.
    # Check reader key is within the dictionary
    if "reader" not in metadata:
        raise ValueError(
            "The `reader` key is not available in the metadata of the"
            f" {data_source} {campaign_name} {station_name} station."
        )

    # ------------------------------------------------------------------------.
    # Check reader name validity
    _check_metadata_reader(metadata)

    # ------------------------------------------------------------------------.
    # Retrieve reader
    reader = _get_reader_from_metadata(metadata)

    # ------------------------------------------------------------------------.
    # Check reader argument
    check_reader_arguments(reader)

    return reader


####--------------------------------------------------------------------------.
#### Readers Docs


[docs]def is_documented_by(original):
    """Wrapper function to apply generic docstring to the decorated function.

    Parameters
    ----------
    original : function
        Function to take the docstring from.
    """

    def wrapper(target):
        target.__doc__ = original.__doc__
        return target

    return wrapper


[docs]def reader_generic_docstring():
    """Script to convert the raw data to L0A format.

    Parameters
    ----------
    raw_dir : str
        The directory path where all the raw content of a specific campaign is stored.
        The path must have the following structure:
            <...>/DISDRODB/Raw/<data_source>/<campaign_name>'.
        Inside the raw_dir directory, it is required to adopt the following structure:
        - /data/<station_name>/<raw_files>
        - /metadata/<station_name>.yaml
        Important points:
        - For each <station_name> there must be a corresponding YAML file in the metadata subfolder.
        - The <campaign_name> must semantically match between:
           - the raw_dir and processed_dir directory paths;
           - with the key 'campaign_name' within the metadata YAML files.
        - The campaign_name are expected to be UPPER CASE.
    processed_dir : str
        The desired directory path for the processed DISDRODB L0A and L0B products.
        The path should have the following structure:
            <...>/DISDRODB/Processed/<data_source>/<campaign_name>'
        For testing purpose, this function exceptionally accept also a directory path simply ending
        with <campaign_name> (i.e. /tmp/<campaign_name>).
    station_name : str
        Station name
    force : bool
        If True, overwrite existing data into destination directories.
        If False, raise an error if there are already data into destination directories.
        The default is False.
    verbose : bool
        Whether to print detailed processing information into terminal.
        The default is True.
    parallel : bool
        If True, the files are processed simultanously in multiple processes.
        The number of simultaneous processes can be customized using the dask.distributed LocalCluster.
        If False, the files are processed sequentially in a single process.
        If False, multi-threading is automatically exploited to speed up I/0 tasks.
    debugging_mode : bool
        If True, it reduces the amount of data to process.
        It processes just the first 3 raw data files.
        The default is False.
    """


####--------------------------------------------------------------------------.
#### Check DISDRODB readers


[docs]def check_available_readers():
    """Check the readers arguments of all package."""
    dict_all_readers = available_readers(data_sources=None, reader_path=False)
    for reader_data_source, list_reader_name in dict_all_readers.items():
        for reader_name in list_reader_name:
            try:
                reader = get_reader(reader_data_source=reader_data_source, reader_name=reader_name)
                check_reader_arguments(reader)
            except Exception as e:
                raise ValueError(f"Unvalid reader for {reader_data_source}/{reader_name}.py. The error is {e}")
    return None


####--------------------------------------------------------------------------.