# Source code for maha.processors.basic_processors

""" All basic processors """

from __future__ import annotations

__all__ = [
    "TextProcessor",
    "FileProcessor",
]


import pathlib
from typing import Callable

from .base_processor import BaseProcessor


class TextProcessor(BaseProcessor):
    """For processing text input.

    The text is stored as a list of strings in ``self.lines``; a single
    string input is stored as a one-element list.

    Parameters
    ----------
    text : Union[List[str], str]
        A text or list of strings to process
    """

    def __init__(self, text: list[str] | str) -> None:
        self.set_lines(text)

    def apply(self, fn: Callable[[str], str]) -> None:
        """Replaces every stored line with the result of ``fn(line)``.

        Parameters
        ----------
        fn : Callable[[str], str]
            Function called once per line; its return value becomes the new line
        """
        self.lines: list[str] = list(map(fn, self.lines))

    def filter(self, fn: Callable[[str], bool]) -> None:
        """Keeps only the lines for which ``fn(line)`` is truthy.

        Parameters
        ----------
        fn : Callable[[str], bool]
            Predicate called once per line
        """
        self.lines = list(filter(fn, self.lines))

    def get_lines(self, n_lines: int = 100):
        """Yields the stored lines in consecutive chunks.

        Parameters
        ----------
        n_lines : int, optional
            Maximum number of lines per chunk, by default 100

        Yields
        ------
        List[str]
            The next slice of at most ``n_lines`` lines; the last chunk may be
            shorter
        """
        for i in range(0, len(self.lines), n_lines):
            yield self.lines[i : i + n_lines]

    def set_lines(self, text: list[str] | str) -> None:
        """Overrides text

        Parameters
        ----------
        text : Union[List[str], str]
            New text or list of strings
        """
        self.lines = []
        if isinstance(text, str):
            # A plain string is kept whole as a single line (no splitting here).
            self.lines = [text]
        else:
            # Copy the items so the caller's list is not aliased.
            self.lines.extend(text)

    @property
    def text(self) -> str:
        """Returns the processed text joined by the newline separator ``\\n``

        Returns
        -------
        str
            processed text
        """
        return "\n".join(self.lines)

    @classmethod
    def from_text(cls, text: str, sep: str | None = None) -> TextProcessor:
        """Creates a new processor from the given text. Separate the text by the input
        ``sep`` argument if provided.

        Parameters
        ----------
        text : str
            Text to process
        sep : str, optional
            Separator used to split the given text, by default None. An empty
            string is treated the same as None (no splitting).

        Returns
        -------
        TextProcessor
            New text processor
        """
        if sep:
            return TextProcessor(text.split(sep))

        return TextProcessor(text)

    @classmethod
    def from_list(cls, lines: list[str]) -> TextProcessor:
        """Creates a new processor from the given list of strings.

        Parameters
        ----------
        lines : List[str]
            list of strings

        Returns
        -------
        TextProcessor
            New text processor
        """
        return TextProcessor(lines)

    def drop_duplicates(self):
        """Drops duplicate lines from text

        The first occurrence of each line is kept and the original order is
        preserved.

        Returns
        -------
        TextProcessor
            This same processor, to allow chaining
        """
        # dict keys are unique and keep insertion order, unlike a set.
        self.lines = list(dict.fromkeys(self.lines))
        return self


class FileProcessor(TextProcessor):
    """For processing file input.

    The whole file is read into memory as UTF-8 text and split on ``\\n``
    into lines.

    .. note::
        For large files (>100 MB), use :class:`~.StreamFileProcessor`.

    Parameters
    ----------
    path : Union[str, :obj:`pathlib.Path`]
        Path of the file to process.

    Raises
    ------
    FileNotFoundError
        If the file doesn't exist.
    ValueError
        If the file is empty.
    """

    def __init__(self, path: str | pathlib.Path) -> None:
        if isinstance(path, str):
            path = pathlib.Path(path)

        # is_file() is also False for directories, so those are rejected here too.
        if not path.is_file():
            raise FileNotFoundError(f"{path} doesn't exist.")

        with path.open("r", encoding="utf8") as f:
            text = f.read()

        if not text:
            raise ValueError("File empty.")

        # Split on "\n" (not splitlines) so that joining the lines with "\n"
        # reproduces the text that was read, including a trailing newline.
        super().__init__(text.split("\n"))


class DataFrameProcessor:
    """Placeholder processor that is not implemented yet.

    Raises
    ------
    NotImplementedError
        Always, as soon as the class is instantiated.
    """

    def __init__(self) -> None:
        raise NotImplementedError


class FolderProcessor:
    """Placeholder processor that is not implemented yet.

    Raises
    ------
    NotImplementedError
        Always, as soon as the class is instantiated.
    """

    def __init__(self) -> None:
        raise NotImplementedError