Source code for niklib.data.functional

"""
Contains implementations of functions that can be used for processing data anywhere and
are not necessarily bound to a class.

"""

# core
import pandas as pd
import xmltodict
import csv
# ours: data
from niklib.data.preprocessor import FileTransformCompose
# helpers
from typing import Callable, Optional, Union, cast
from enlighten import Manager
from fnmatch import fnmatch
import enlighten
import logging
import sys
import os


# module-level logger, named after this module (``__name__``); used by the functions below
logger = logging.getLogger(__name__)



[docs]
def dict_to_csv(d: dict, path: str) -> None:
    """Takes a flattened dictionary and writes it to a CSV file.

    The written file contains exactly two records: a header made of the keys
    of ``d`` followed by a single row made of the corresponding values.

    Args:
        d (dict): A dictionary
        path (str): Path to the output file (will be created if not exist and
            overwritten if it does)
    """

    # `newline=''` is what the `csv` module asks for: the writer emits its own
    # line terminators, and letting the text layer translate them a second time
    # yields an empty line after every row on platforms that use CRLF.
    with open(path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(d))
        writer.writeheader()
        writer.writerow(d)




[docs]
def dump_directory_structure_csv(
    src: str,
    shallow: bool = True
) -> None:
    """Writes the directory structure of ``src`` into a csv file inside ``src``

    The structure tree of ``src`` is built, flattened into a single level
    dictionary, stripped from entries whose value is ``None`` and finally
    written to ``'label.csv'`` located directly under ``src``.

    Note:
        This has been used to manually extract and record labels.

    Args:
        src (str): Source directory path
        shallow (bool, optional): If only dive one level of depth (False: recursive).
            Defaults to True.
    """

    tree = create_directory_structure_tree(src=src, shallow=shallow)

    # keep only the flattened entries that actually carry a value
    labels = {}
    for key, value in flatten_dict(tree).items():
        if value is not None:
            labels[key] = value

    dict_to_csv(d=labels, path=src + '/label.csv')




[docs]
def create_directory_structure_tree(src: str, shallow: bool = False) -> dict:
    """Takes a path to directory and creates a dictionary of its directory structure tree

    Args:
        src (str): Path to source directory
        shallow (bool, optional): Whether or not just dive to root dir's subdir.
            Defaults to False.

    Reference:
        1. https://stackoverflow.com/a/25226267/18971263

    Returns:
        dict:
            A node of the tree:

                - if ``src`` is not a directory: ``{'name': None}``
                - if ``shallow``: ``{'name': <basename of src>, 'children': [...]}``
                  where each child is ``{<entry name>: '0'}`` for every entry
                  (files and directories alike) directly under ``src``
                - otherwise: ``{'name': <basename of src>, 'children': [...]}``
                  where each child is the node built recursively for that entry

            Children are ordered by entry name so that the result (and any
            index-based key derived from it) is reproducible across runs.
    """

    # ignore files, only dir: anything that is not a directory is a nameless leaf
    if not os.path.isdir(src):
        return {'name': None}

    # `os.listdir` gives entries in arbitrary order; sort them to be deterministic
    entries = sorted(os.listdir(src))
    if shallow:
        children = [{entry: '0'} for entry in entries]
    else:  # recursively walk into all dirs and subdirs
        children = [
            create_directory_structure_tree(os.path.join(src, entry), shallow=False)
            for entry in entries
        ]
    return {'name': os.path.basename(src), 'children': children}




[docs]
def flatten_dict(d: dict) -> dict:
    """Takes a (nested) multilevel dictionary and flattens it

    Nested dictionaries contribute their key joined by ``'.'`` and list items
    contribute their position as ``'[index]'``. List items that are not
    containers themselves (e.g. plain strings) are kept under
    ``'<key>.[<index>]'`` instead of being silently discarded.
    Empty dictionaries and empty lists produce no entry at all.

    Args:
        d (dict): A dictionary (could be multilevel). Anything that is not a
            dictionary results in an empty dictionary.

    Reference:
        1. https://stackoverflow.com/a/67744709/18971263

    Examples:
        >>> flatten_dict({'a': 1, 'b': {'c': 2}})
        {'a': 1, 'b.c': 2}
        >>> flatten_dict({'a': [{'x': 1}, 'y']})
        {'a.[0].x': 1, 'a.[1]': 'y'}

    Returns:
        dict:
            Flattened dictionary where keys and values of returned dict are:
                - ``new_keys[i] = f'{old_keys[level]}.{old_keys[level+1]}.[...].{old_keys[level+n]}'``
                - ``new_value = old_value``

    """

    def walk(node, path):
        # yields `(flat_key, leaf_value)` for every leaf reachable from `node`
        if isinstance(node, dict):
            # nested subtree
            for key, value in node.items():
                yield from walk(value, '{}.{}'.format(path, key))
        elif isinstance(node, list):
            # nested list
            for num, elem in enumerate(node):
                yield from walk(elem, '{}.[{}]'.format(path, num))
        else:
            # everything else (only leafs should remain)
            yield path, node

    if not isinstance(d, dict):
        return {}

    flat = {}
    for key, value in d.items():
        # top-level keys are passed as-is, so a top-level leaf keeps its original key
        flat.update(walk(value, key))
    return flat




[docs]
def xml_to_flattened_dict(xml: str) -> dict:
    """Parses a (nested) XML string and flattens the result via :func:`flatten_dict`

    Args:
        xml (str): A XML string

    Returns:
        dict: A flattened dictionary of given XML
    """
    # XML -> (nested) dict -> single level dict
    return flatten_dict(xmltodict.parse(xml))




[docs]
def process_directory(
    src_dir: str,
    dst_dir: str,
    compose: FileTransformCompose,
    file_pattern: str = '*',
    manager: Optional[Manager] = None
) -> None:
    """Transforms all files that match pattern in given dir and saves new files preserving dir structure

    Every directory under ``src_dir`` (including ``src_dir`` itself) is visited.
    For each directory that contains at least one file matching ``file_pattern``,
    the same relative directory is created under ``dst_dir`` and ``compose`` is
    called as ``compose(input_path, output_path)`` for each matching file.
    Directories without any matching file are not created in ``dst_dir``.

    Note:
        A methods used for handling files from manually processed dataset to raw-dataset
        see :class:`FileTransform <niklib.data.preprocessor.FileTransform>` for more information.

    References:
        1. https://stackoverflow.com/a/24041933/18971263

    Args:
        src_dir (str): Source directory to be processed
        dst_dir (str): Destination directory to write processed files
        compose (FileTransformCompose): An instance of transform composer.
            see :class:`niklib.data.preprocessor.FileTransformCompose`.
        file_pattern (str, optional): pattern to match files, default to ``'*'`` for
            all files. Defaults to ``'*'``.
        manager (Optional[Manager], optional): ``enlighten.Manager`` for progressbar.
            Defaults to None.

    Raises:
        AssertionError: If ``src_dir`` and ``dst_dir`` are the same string
    """

    assert src_dir != dst_dir, 'Source and destination dir must differ.'
    # with a trailing slash, every path yielded by `os.walk` starts with `src_dir`
    if src_dir[-1] != '/':
        src_dir += '/'

    # logging
    manager = enlighten.get_manager(sys.stderr) if manager is None else manager
    progress_bar = manager.counter(
        # number of directories located directly under `src_dir`
        total=len(next(os.walk(src_dir), (None, [], None))[1]),
        desc='Extracted',
        unit='data point files'
    )

    # process directories
    for i, (dirpath, _, all_filenames) in enumerate(os.walk(src_dir)):
        # filter out files that match pattern only; a real list is needed here
        # because a lazy `filter` object is truthy even when nothing matches
        filenames = [
            fname for fname in all_filenames if fnmatch(fname, file_pattern)
        ]
        dirname = os.path.basename(dirpath)
        logger.warning(f'Processing directory="{dirname}"...')
        if filenames:
            # strip only the leading `src_dir`; `str.replace` would also remove
            # later occurrences of it inside the path and corrupt the destination
            dir_ = os.path.join(dst_dir, dirpath[len(src_dir):])
            os.makedirs(dir_, exist_ok=True)
            for fname in filenames:
                in_fname = os.path.join(dirpath, fname)  # original path
                out_fname = os.path.join(dir_, fname)  # processed path
                compose(in_fname, out_fname)  # composition of transforms
                logger.info(f'Processed file="{fname}"')
        logger.info(f'Processed "{i}th" data entry.')
        progress_bar.update()




[docs]
def extended_dict_get(
    string: str,
    dic: dict,
    if_nan: str,
    condition: Union[Callable, bool, None] = None
):
    """Takes a string and looks for it inside a dictionary with default value if condition is satisfied

    Args:
        string (str): the ``string`` to look for inside dictionary ``dic``
        dic (dict): the dictionary that ``string`` is expected to be
        if_nan (str): the value returned if ``string`` could not be found in ``dic``
        condition (Union[Callable, bool, None], optional): look for ``string``
            in ``dic`` only if ``condition`` is satisfied:

                - ``None``: always satisfied
                - a callable: satisfied if ``condition(string)`` is truthy
                - anything else (e.g. a ``bool``): satisfied if the value itself is truthy

            Defaults to None.

    Examples:
        >>> d = {'1': 'a', '2': 'b', '3': 'c'}
        >>> extended_dict_get('1', d, 'z', str.isnumeric)
        'a'
        >>> extended_dict_get('x', d, 'z', str.isnumeric)
        'x'
        >>> extended_dict_get('1', d, 'z', True)
        'a'

    Returns:
        Any: Substituted value instead of `string`; ``string`` itself when
        ``condition`` is not satisfied
    """

    if condition is None:
        satisfied = True
    elif callable(condition):
        satisfied = condition(string)
    else:
        # a plain flag is allowed by the signature; calling it would raise `TypeError`
        satisfied = bool(condition)

    # check given `condition` is true or not
    if satisfied:
        return dic.get(string, if_nan)  # look for `string` if not use `if_nan`

    logger.debug(
        f'"{string}" is not True for the given `condition`'
         ' ==> `false_condition_value` will be applied.')
    return string




[docs]
def config_csv_to_dict(path: str) -> dict:
    """Reads a config CSV and maps its first column to its second column

    Note:
        Configs of our use case can be found in :mod:`niklib.configs`

    Args:
        path (str): string path to config file

    Returns:
        dict: A dictionary whose keys are the entries of the first column and
        whose values are the entries of the second column of the csv
    """

    table = pd.read_csv(path)
    # columns are addressed by position: the first one holds keys, the second one values
    keys = table.iloc[:, 0]
    values = table.iloc[:, 1]
    return dict(zip(keys, values))