Source code for maha.processors.base_processor

""" The base for all processors """
from __future__ import annotations

__all__ = [
    "BaseProcessor",
]


from abc import ABC, abstractmethod
from functools import partial
from typing import Callable

from maha.cleaners.functions import (
    connect_single_letter_word,
    contains,
    contains_repeated_substring,
    contains_single_letter_word,
    keep,
    normalize,
    reduce_repeated_substring,
    remove,
    replace,
    replace_expression,
    replace_pairs,
)
from maha.rexy import Expression, ExpressionGroup

from .utils import ObjectGet


class BaseProcessor(ABC):
    """Base class for all processors.

    It contains almost all functions needed for the processors. Concrete
    subclasses only have to implement :meth:`get_lines`, :meth:`apply` and
    :meth:`filter`; every other method here is built on top of those three
    and returns ``self`` so that calls can be chained.

    Parameters
    ----------
    text : Union[List[str], str]
        A text or list of strings to process
    """

    @abstractmethod
    def get_lines(self, n_lines: int = 100):
        """Returns a generator of list of strings with length of ``n_lines``

        Parameters
        ----------
        n_lines : int
            Number of lines to yield, Defaults to 100

        Yields
        -------
        List[str]
            List of strings with length of ``n_lines``. The last list maybe
            of length less than ``n_lines``.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply(self, fn: Callable[[str], str]):
        """Applies a function to each line

        Parameters
        ----------
        fn :
            Function to apply
        """
        raise NotImplementedError()

    @abstractmethod
    def filter(self, fn: Callable[[str], bool]):
        """Keeps lines for which the input function is True

        Parameters
        ----------
        fn :
            Function to check
        """
        raise NotImplementedError()

    def get(
        self,
        unique_characters: bool = False,
        character_length: bool = False,
        word_length: bool = False,
    ):
        """Returns statistics about the provided text

        Parameters
        ----------
        unique_characters : bool, optional
            Return all unique characters, by default False
        character_length : bool, optional
            Return the character length of each string, by default False
        word_length : bool, optional
            Return the word length of each string (split by space),
            by default False

        Returns
        -------
        Union[Dict[str, Any], Any]
            * If one argument is set to True, its value is returned
            * If more than one argument is set to True, a dictionary is
              returned where keys are the True passed arguments with the
              corresponding values
            * If no argument is set to True, an empty dictionary is returned
        """

        def collect(measure):
            # Build a reducer that appends ``measure(line)`` to the running
            # list in place. ``prev + [x]`` would copy the whole list for
            # every line, which is quadratic over a large corpus.
            def step(prev, current):
                prev.append(measure(current))
                return prev

            return step

        objects = []
        if unique_characters:
            objects.append(
                ObjectGet(
                    func=lambda prev, current: prev | set(current),
                    prev=set(),
                    name="unique_characters",
                    post_fn=list,
                )
            )
        if character_length:
            objects.append(
                ObjectGet(
                    func=collect(len),
                    prev=[],
                    name="character_length",
                )
            )
        if word_length:
            objects.append(
                ObjectGet(
                    func=collect(lambda text: len(text.split())),
                    prev=[],
                    name="word_length",
                )
            )

        # Lines are requested one at a time, so each chunk holds one string.
        for chunk in self.get_lines(1):
            line = chunk[0]
            for obj in objects:
                obj.prev = obj.func(obj.prev, line)

        # A single requested statistic is returned bare, not wrapped in a dict.
        if len(objects) == 1:
            only = objects[0]
            return only.post_fn(only.prev)
        return {obj.name: obj.post_fn(obj.prev) for obj in objects}

    def print_unique_characters(self):
        """Prints all unique characters in the text"""
        unique = self.get(unique_characters=True)
        print(f"{len(unique)} unique characters were found, they are:")
        print(unique)
        return self

    def keep(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        use_space: bool = True,
        custom_strings: list[str] | str | None = None,
    ):
        """Applies :func:`~.keep` to each line"""
        # ``locals()`` must be evaluated before any other local is bound so
        # that it holds exactly the call arguments.
        self.apply(partial(keep, **self._arguments_except_self(locals())))
        return self

    def normalize(
        self,
        lam_alef: bool | None = None,
        alef: bool | None = None,
        waw: bool | None = None,
        yeh: bool | None = None,
        teh_marbuta: bool | None = None,
        ligatures: bool | None = None,
        spaces: bool | None = None,
        all: bool | None = None,
    ):
        """Applies :func:`~.normalize` to each line"""
        self.apply(partial(normalize, **self._arguments_except_self(locals())))
        return self

    def connect_single_letter_word(
        self,
        waw: bool | None = None,
        feh: bool | None = None,
        beh: bool | None = None,
        lam: bool | None = None,
        kaf: bool | None = None,
        teh: bool | None = None,
        all: bool | None = None,
        custom_strings: list[str] | str | None = None,
    ):
        """Applies :func:`~.connect_single_letter_word` to each line"""
        self.apply(
            partial(connect_single_letter_word, **self._arguments_except_self(locals()))
        )
        return self

    def replace(self, strings: list[str] | str, with_value: str):
        """Applies :func:`~.replace` to each line"""
        self.apply(partial(replace, **self._arguments_except_self(locals())))
        return self

    def replace_expression(
        self,
        expression: Expression | ExpressionGroup | str,
        with_value: Callable[..., str] | str,
    ):
        """Applies :func:`~.replace_expression` to each line"""
        self.apply(partial(replace_expression, **self._arguments_except_self(locals())))
        return self

    def replace_pairs(self, keys: list[str], values: list[str]):
        """Applies :func:`~.replace_pairs` to each line"""
        self.apply(partial(replace_pairs, **self._arguments_except_self(locals())))
        return self

    def reduce_repeated_substring(self, min_repeated: int = 3, reduce_to: int = 2):
        """Applies :func:`~.reduce_repeated_substring` to each line"""
        self.apply(
            partial(reduce_repeated_substring, **self._arguments_except_self(locals()))
        )
        return self

    def remove(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        tatweel: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        arabic_ligatures: bool = False,
        arabic_hashtags: bool = False,
        arabic_mentions: bool = False,
        emails: bool = False,
        english_hashtags: bool = False,
        english_mentions: bool = False,
        hashtags: bool = False,
        links: bool = False,
        mentions: bool = False,
        emojis: bool = False,
        use_space: bool = True,
        custom_strings: list[str] | str | None = None,
        custom_expressions: list[str] | str | None = None,
    ):
        """Applies :func:`~.remove` to each line"""
        self.apply(partial(remove, **self._arguments_except_self(locals())))
        return self

    def drop_lines_contain(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        tatweel: bool = False,
        lam_alef_variations: bool = False,
        lam_alef: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        arabic_ligatures: bool = False,
        persian: bool = False,
        arabic_hashtags: bool = False,
        arabic_mentions: bool = False,
        emails: bool = False,
        english_hashtags: bool = False,
        english_mentions: bool = False,
        hashtags: bool = False,
        links: bool = False,
        mentions: bool = False,
        emojis: bool = False,
        custom_strings: list[str] | str | None = None,
        custom_expressions: list[str] | str | None = None,
        operator: str = "or",
    ):
        """Drop lines that contain any of the selected strings or patterns.

        .. note::
            Use ``operator='and'`` to drop lines that contain all selected
            strings or patterns.

        See :func:`~.contains` for arguments description

        Raises
        ------
        ValueError
            If ``operator`` is None.
        """
        if operator is None:
            raise ValueError("operator cannot be None")

        # Snapshot the call arguments once; the predicate below runs for
        # every line and should not rebuild this dictionary each time.
        kwargs = self._arguments_except_self(locals())
        self.filter(lambda text: not contains(text, **kwargs))
        return self

    def drop_empty_lines(self):
        """Drop empty lines."""
        return self.drop_lines_below_len(1)

    def drop_lines_below_len(self, length: int, word_level: bool = False):
        """Drop lines with a number of characters/words less than the input
        ``length``

        Parameters
        ----------
        length : int
            Number of characters/words
        word_level : bool, optional
            True to switch to word level, which splits the text by space,
            by default False
        """

        def long_enough(line):
            size = len(line.split()) if word_level else len(line)
            return size >= length

        self.filter(long_enough)
        return self

    def drop_lines_above_len(self, length: int, word_level: bool = False):
        """Drop lines with a number of characters/words more than the input
        ``length``

        Parameters
        ----------
        length : int
            Number of characters/words
        word_level : bool, optional
            True to switch to word level, which splits the text by space,
            by default False
        """

        def short_enough(line):
            size = len(line.split()) if word_level else len(line)
            return size <= length

        self.filter(short_enough)
        return self

    def drop_lines_contain_repeated_substring(self, repeated: int = 3):
        """Drop lines containing a number of consecutive repeated substrings

        Parameters
        ----------
        repeated : int, optional
            Minimum number of repetitions, by default 3
        """
        self.filter(lambda line: not contains_repeated_substring(line, repeated))
        return self

    def drop_lines_contain_single_letter_word(
        self,
        arabic_letters: bool = False,
        english_letters: bool = False,
    ):
        """Drop lines containing a single-letter word (e.g."محمد و احمد" or
        "how r u"). In Arabic, single-letter words are rare.

        .. warning::
            In English, all lines containing the letter "I" will be dropped
            since it is considered a single-letter word

        See :func:`~.contains_single_letter_word`.
        See also :func:`~.connect_single_letter_word`.
        """
        # Computed once up front rather than once per filtered line.
        kwargs = self._arguments_except_self(locals())
        self.filter(lambda text: not contains_single_letter_word(text, **kwargs))
        return self

    def filter_lines_contain(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        tatweel: bool = False,
        lam_alef_variations: bool = False,
        lam_alef: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        arabic_ligatures: bool = False,
        persian: bool = False,
        arabic_hashtags: bool = False,
        arabic_mentions: bool = False,
        emails: bool = False,
        english_hashtags: bool = False,
        english_mentions: bool = False,
        hashtags: bool = False,
        links: bool = False,
        mentions: bool = False,
        emojis: bool = False,
        custom_strings: list[str] | str | None = None,
        custom_expressions: list[str] | str | None = None,
        operator: str = "or",
    ):
        """Keep lines that contain any of the selected strings or patterns.

        .. note::
            Use ``operator='and'`` to keep only lines that contain all
            selected strings or patterns.

        See :func:`~.contains` for arguments description

        Raises
        ------
        ValueError
            If ``operator`` is None.
        """
        if operator is None:
            raise ValueError("operator cannot be None")

        # Snapshot the call arguments once; the predicate below runs for
        # every line and should not rebuild this dictionary each time.
        kwargs = self._arguments_except_self(locals())
        self.filter(lambda text: bool(contains(text, **kwargs)))
        return self

    def _arguments_except_self(self, arguments: dict):
        """Used in combination with ``locals()`` to return all arguments
        without ``self``.

        The ``"arguments"`` key is dropped as well, so a caller that stored
        the ``locals()`` result in a variable of that name does not forward
        it by accident.
        """
        excluded = ("self", "arguments")
        return {
            name: value for name, value in arguments.items() if name not in excluded
        }