Source code for niklib.data.preprocessor

# core
from dateutil.relativedelta import *
import pikepdf
# helpers
from typing import Any
import shutil
import logging


# logging
logger = logging.getLogger(__name__)


class FileTransform:
    """Base class for composable, callable transforms that operate on files.

    Subclass this for any behavior that acts on a file as a whole (the file
    itself, not the content of the file).
    """

    def __init__(self) -> None:
        return None

    def __call__(self, src: str, dst: str, *args: Any, **kwds: Any) -> Any:
        """Process ``src`` and save the result to ``dst``.

        The base implementation does nothing and returns ``None``.

        Args:
            src: source file to be processed
            dst: the path where the processed file is to be saved
        """
        return None
class CopyFile(FileTransform):
    """Only copies a file, a wrapper around ``shutil``'s copying functions.

    The copying function is selected by ``mode``:

    * ``'c'``: :func:`shutil.copy`
    * ``'cf'``: :func:`shutil.copyfile` (the default)
    * ``'c2'``: :func:`shutil.copy2`

    For more info see the
    `shutil documentation <https://docs.python.org/3/library/shutil.html>`_.

    Reference:
        1. https://stackoverflow.com/a/30359308/18971263
    """

    def __init__(self, mode: str = 'cf') -> None:
        """
        Args:
            mode: copying mode, one of ``'c'``, ``'cf'``, ``'c2'``. ``None``
                is accepted as well and falls back to ``'cf'``.

        Raises:
            ValueError: if ``mode`` is not one of the supported modes
        """
        super().__init__()
        self.COPY_MODES = ['c', 'cf', 'c2']
        self.mode = mode if mode is not None else 'cf'
        # Validate the *resolved* mode: checking the raw argument would reject
        # ``None`` even though it has just been mapped to the default.
        self.__check_mode(mode=self.mode)

    def __call__(self, src: str, dst: str, *args: Any, **kwds: Any) -> Any:
        """Copy ``src`` to ``dst`` with the function selected by ``self.mode``.

        Args:
            src: source file to be copied
            dst: destination path of the copy

        Returns:
            Any: None
        """
        if self.mode == 'c':
            shutil.copy(src, dst)
        elif self.mode == 'cf':
            shutil.copyfile(src, dst)
        elif self.mode == 'c2':
            shutil.copy2(src, dst)

    def __check_mode(self, mode: str):
        """Checks that the copying mode is one of ``self.COPY_MODES``.

        Args:
            mode: copying mode in ``shutil``, one of ``'c'``, ``'cf'``, ``'c2'``

        Raises:
            ValueError: if ``mode`` is not in ``self.COPY_MODES``
        """
        if mode not in self.COPY_MODES:
            # A single formatted string: passing two arguments to ValueError
            # renders the message as a tuple, and the second part was not an
            # f-string so the list of valid modes was never interpolated.
            raise ValueError(
                f'Mode {mode} does not exist, '
                f'choose one of "{self.COPY_MODES}".'
            )
class MakeContentCopyProtectedMachineReadable(FileTransform):
    """Reads a ``'content-copy'`` protected PDF and removes this restriction

    Removing the protection is done by saving a "printed" version of the
    file via pikepdf_.

    References:
        1. https://www.reddit.com/r/Python/comments/t32z2o/simple_code_to_unlock_all_readonly_pdfs_in/
        2. https://pikepdf.readthedocs.io/en/latest/

    .. _pikepdf: https://pikepdf.readthedocs.io/en/latest/
    """

    def __init__(self) -> None:
        super().__init__()

    def __call__(self, src: str, dst: str, *args: Any, **kwds: Any) -> Any:
        """Open ``src`` with pikepdf and save it again to ``dst``.

        Args:
            src (str): source file to be processed
            dst (str): destination to save the processed file

        Returns:
            Any: None
        """
        # ``allow_overwriting_input=True`` is what permits ``dst`` to be the
        # same path as ``src`` (see pikepdf.open documentation). The context
        # manager closes the Pdf even when saving fails.
        with pikepdf.open(src, allow_overwriting_input=True) as pdf:
            pdf.save(dst)
class FileTransformCompose:
    """Composes several transforms operating on files together

    Each transform is tied to a keyword, and it is only applied to files
    whose source path contains that keyword.

    Transformation dictionary over files in the following structure::

        {
            FileTransform: 'filter_str',
            ...,
        }

    Note:
        Transforms are applied in the iteration order of the dictionary.
        Every matching transform is called with the same ``src`` and ``dst``;
        the output of one transform is not fed into the next one.
    """

    def __init__(self, transforms: dict) -> None:
        """
        Args:
            transforms: a dictionary of transforms, where the key is the
                instance of FileTransform and the value is the keyword that
                the transform will be applied to. ``None`` is accepted and
                results in a composition that applies nothing.

        Raises:
            TypeError: if a key is not an instance of FileTransform
        """
        if transforms is not None:
            for transform in transforms:
                if not isinstance(transform, FileTransform):
                    raise TypeError(f'Keys must be {FileTransform} instance.')

        self.transforms = transforms

    def __call__(self, src: str, dst: str, *args: Any, **kwds: Any) -> Any:
        """Applies transforms in order

        Args:
            src (str): source file path to be processed
            dst (str): destination to save the processed file

        Returns:
            Any: None
        """
        # ``__init__`` explicitly tolerates ``None``; treat it as "no
        # transforms" instead of failing on ``None.items()``.
        if self.transforms is None:
            return None

        for transform, file_filter in self.transforms.items():
            # plain substring match of the keyword against the source path
            if file_filter in src:
                transform(src, dst)
        return None