""" The base for all processors """
from __future__ import annotations
# Names exported by ``from <this module> import *``.
__all__ = [
"BaseProcessor",
]
from abc import ABC, abstractmethod
from functools import partial
from typing import Callable
from maha.cleaners.functions import (
connect_single_letter_word,
contains,
contains_repeated_substring,
contains_single_letter_word,
keep,
normalize,
reduce_repeated_substring,
remove,
replace,
replace_expression,
replace_pairs,
)
from maha.rexy import Expression, ExpressionGroup
from .utils import ObjectGet
class BaseProcessor(ABC):
    """Base class for all processors.

    It contains almost all functions needed for the processors. Subclasses
    provide the storage by implementing :meth:`get_lines`, :meth:`apply` and
    :meth:`filter`; every other method in this class is built on those three.
    Methods that modify the lines return ``self`` so calls can be chained.

    Parameters
    ----------
    text : Union[List[str], str]
        A text or list of strings to process (this base class defines no
        ``__init__``; the argument is consumed by concrete subclasses)
    """

    @abstractmethod
    def get_lines(self, n_lines: int = 100):
        """Returns a generator of list of strings with length of ``n_lines``

        Parameters
        ----------
        n_lines : int
            Number of lines to yield, Defaults to 100

        Yields
        -------
        List[str]
            List of strings with length of ``n_lines``. The last list maybe of length
            less than ``n_lines``.
        """
        raise NotImplementedError()

    @abstractmethod
    def apply(self, fn: Callable[[str], str]):
        """Applies a function to each line

        Parameters
        ----------
        fn :
            Function to apply
        """
        raise NotImplementedError()

    @abstractmethod
    def filter(self, fn: Callable[[str], bool]):
        """Keeps lines for which the input function is True

        Parameters
        ----------
        fn :
            Function to check
        """
        raise NotImplementedError()

    def get(
        self,
        unique_characters: bool = False,
        character_length: bool = False,
        word_length: bool = False,
    ):
        """Returns statistics about the provided text

        Parameters
        ----------
        unique_characters : bool, optional
            Return all unique characters, by default False
        character_length : bool, optional
            Return the character length of each string, by default False
        word_length : bool, optional
            Return the word length of each string (split by whitespace),
            by default False

        Returns
        -------
        Union[Dict[str, Any], Any]
            * If one argument is set to True, its value is returned
            * If more than one argument is set to True, a dictionary is returned where
              keys are the True passed arguments with the corresponding values
            * If no argument is set to True, an empty dictionary is returned
        """

        # The accumulators below update their state in place and hand the same
        # object back, so collecting statistics stays linear in the number of
        # lines instead of copying the whole accumulator for every line.
        def add_characters(seen, text):
            seen.update(text)
            return seen

        def add_character_length(lengths, text):
            lengths.append(len(text))
            return lengths

        def add_word_length(lengths, text):
            lengths.append(len(text.split()))
            return lengths

        collectors = []
        if unique_characters:
            collectors.append(
                ObjectGet(
                    func=add_characters,
                    prev=set(),
                    name="unique_characters",
                    post_fn=list,
                )
            )
        if character_length:
            collectors.append(
                ObjectGet(
                    func=add_character_length,
                    prev=[],
                    name="character_length",
                )
            )
        if word_length:
            collectors.append(
                ObjectGet(
                    func=add_word_length,
                    prev=[],
                    name="word_length",
                )
            )

        # Lines are requested one at a time, but every item of each yielded
        # chunk is visited so an empty or oversized chunk cannot raise or
        # silently skip text.
        for chunk in self.get_lines(1):
            for text in chunk:
                for collector in collectors:
                    collector.prev = collector.func(collector.prev, text)

        if len(collectors) == 1:
            only = collectors[0]
            return only.post_fn(only.prev)
        return {
            collector.name: collector.post_fn(collector.prev)
            for collector in collectors
        }

    def print_unique_characters(self):
        """Prints all unique characters in the text"""
        unique = self.get(unique_characters=True)
        print(f"{len(unique)} unique characters were found, they are:")
        print(unique)
        return self

    def keep(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        use_space: bool = True,
        custom_strings: list[str] | str | None = None,
    ):
        """Applies :func:`~.keep` to each line"""
        # locals() is read before any other local is created, so it holds only
        # ``self`` and the parameters, which are forwarded as keyword arguments.
        self.apply(partial(keep, **self._arguments_except_self(locals())))
        return self

    def normalize(
        self,
        lam_alef: bool | None = None,
        alef: bool | None = None,
        waw: bool | None = None,
        yeh: bool | None = None,
        teh_marbuta: bool | None = None,
        ligatures: bool | None = None,
        spaces: bool | None = None,
        all: bool | None = None,
    ):
        """Applies :func:`~.normalize` to each line"""
        self.apply(partial(normalize, **self._arguments_except_self(locals())))
        return self

    def connect_single_letter_word(
        self,
        waw: bool | None = None,
        feh: bool | None = None,
        beh: bool | None = None,
        lam: bool | None = None,
        kaf: bool | None = None,
        teh: bool | None = None,
        all: bool | None = None,
        custom_strings: list[str] | str | None = None,
    ):
        """Applies :func:`~.connect_single_letter_word` to each line"""
        self.apply(
            partial(connect_single_letter_word, **self._arguments_except_self(locals()))
        )
        return self

    def replace(self, strings: list[str] | str, with_value: str):
        """Applies :func:`~.replace` to each line"""
        self.apply(partial(replace, **self._arguments_except_self(locals())))
        return self

    def replace_expression(
        self,
        expression: Expression | ExpressionGroup | str,
        with_value: Callable[..., str] | str,
    ):
        """Applies :func:`~.replace_expression` to each line"""
        self.apply(partial(replace_expression, **self._arguments_except_self(locals())))
        return self

    def replace_pairs(self, keys: list[str], values: list[str]):
        """Applies :func:`~.replace_pairs` to each line"""
        self.apply(partial(replace_pairs, **self._arguments_except_self(locals())))
        return self

    def reduce_repeated_substring(self, min_repeated: int = 3, reduce_to: int = 2):
        """Applies :func:`~.reduce_repeated_substring` to each line"""
        self.apply(
            partial(reduce_repeated_substring, **self._arguments_except_self(locals()))
        )
        return self

    def remove(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        tatweel: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        arabic_ligatures: bool = False,
        arabic_hashtags: bool = False,
        arabic_mentions: bool = False,
        emails: bool = False,
        english_hashtags: bool = False,
        english_mentions: bool = False,
        hashtags: bool = False,
        links: bool = False,
        mentions: bool = False,
        emojis: bool = False,
        use_space: bool = True,
        custom_strings: list[str] | str | None = None,
        custom_expressions: list[str] | str | None = None,
    ):
        """Applies :func:`~.remove` to each line"""
        self.apply(partial(remove, **self._arguments_except_self(locals())))
        return self

    def drop_lines_contain(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        tatweel: bool = False,
        lam_alef_variations: bool = False,
        lam_alef: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        arabic_ligatures: bool = False,
        persian: bool = False,
        arabic_hashtags: bool = False,
        arabic_mentions: bool = False,
        emails: bool = False,
        english_hashtags: bool = False,
        english_mentions: bool = False,
        hashtags: bool = False,
        links: bool = False,
        mentions: bool = False,
        emojis: bool = False,
        custom_strings: list[str] | str | None = None,
        custom_expressions: list[str] | str | None = None,
        operator: str = "or",
    ):
        """Drop lines that contain any of the selected strings or patterns.

        .. note::
            Use ``operator='and'`` to drop lines that contain all selected strings
            or patterns.

        See :func:`~.contains` for arguments description

        Raises
        ------
        ValueError
            If ``operator`` is None
        """
        # Must be the first statement: at this point locals() holds only
        # ``self`` and the parameters. Building the keyword arguments once here
        # also avoids rebuilding the same dictionary for every line.
        kwargs = self._arguments_except_self(locals())
        if operator is None:
            raise ValueError("operator cannot be None")
        self.filter(lambda text: not contains(text, **kwargs))
        return self

    def drop_empty_lines(self):
        """Drop empty lines."""
        return self.drop_lines_below_len(1)

    def drop_lines_below_len(self, length: int, word_level=False):
        """Drop lines with a number of characters/words less than the input ``length``

        Parameters
        ----------
        length : int
            Number of characters/words
        word_level : bool, optional
            True to switch to word level, which splits the text by whitespace,
            by default False
        """

        def long_enough(line):
            size = len(line.split()) if word_level else len(line)
            return size >= length

        self.filter(long_enough)
        return self

    def drop_lines_above_len(self, length: int, word_level=False):
        """Drop lines with a number of characters/words more than the input ``length``

        Parameters
        ----------
        length : int
            Number of characters/words
        word_level : bool, optional
            True to switch to word level, which splits the text by whitespace,
            by default False
        """

        def short_enough(line):
            size = len(line.split()) if word_level else len(line)
            return size <= length

        self.filter(short_enough)
        return self

    def drop_lines_contain_repeated_substring(self, repeated=3):
        """Drop lines containing a number of consecutive repeated substrings

        Parameters
        ----------
        repeated : int, optional
            Minimum number of repetitions, by default 3
        """
        self.filter(lambda line: not contains_repeated_substring(line, repeated))
        return self

    def drop_lines_contain_single_letter_word(
        self,
        arabic_letters: bool = False,
        english_letters: bool = False,
    ):
        """Drop lines containing a single-letter word (e.g."محمد و احمد" or
        "how r u"). In Arabic, single-letter words are rare.

        .. warning::
            In English, all lines containing the letter "I" will be dropped since it is
            considered a single-letter word

        See :func:`~.contains_single_letter_word`.
        See also :func:`~.connect_single_letter_word`.
        """
        # Built once, before any other local exists, and reused for every line.
        kwargs = self._arguments_except_self(locals())
        self.filter(lambda text: not contains_single_letter_word(text, **kwargs))
        return self

    def filter_lines_contain(
        self,
        arabic: bool = False,
        english: bool = False,
        arabic_letters: bool = False,
        english_letters: bool = False,
        english_small_letters: bool = False,
        english_capital_letters: bool = False,
        numbers: bool = False,
        harakat: bool = False,
        all_harakat: bool = False,
        tatweel: bool = False,
        lam_alef_variations: bool = False,
        lam_alef: bool = False,
        punctuations: bool = False,
        arabic_numbers: bool = False,
        english_numbers: bool = False,
        arabic_punctuations: bool = False,
        english_punctuations: bool = False,
        arabic_ligatures: bool = False,
        persian: bool = False,
        arabic_hashtags: bool = False,
        arabic_mentions: bool = False,
        emails: bool = False,
        english_hashtags: bool = False,
        english_mentions: bool = False,
        hashtags: bool = False,
        links: bool = False,
        mentions: bool = False,
        emojis: bool = False,
        custom_strings: list[str] | str | None = None,
        custom_expressions: list[str] | str | None = None,
        operator: str = "or",
    ):
        """Keep lines that contain any of the selected strings or patterns.

        .. note::
            Use ``operator='and'`` to keep lines that contain all selected strings
            or patterns.

        See :func:`~.contains` for arguments description

        Raises
        ------
        ValueError
            If ``operator`` is None
        """
        # Must be the first statement: at this point locals() holds only
        # ``self`` and the parameters. Building the keyword arguments once here
        # also avoids rebuilding the same dictionary for every line.
        kwargs = self._arguments_except_self(locals())
        if operator is None:
            raise ValueError("operator cannot be None")
        self.filter(lambda text: bool(contains(text, **kwargs)))
        return self

    def _arguments_except_self(self, arguments: dict):
        """Used in combination with ``locals()`` to return all arguments without
        ``self``.

        The key ``"arguments"`` is removed as well, since a method that stores
        ``locals()`` in a local variable of that name may find the variable
        itself inside the mapping.

        Parameters
        ----------
        arguments : dict
            Mapping of local names to values, typically the result of ``locals()``

        Returns
        -------
        dict
            A new dictionary without the ``"self"`` and ``"arguments"`` keys
        """
        excluded = ("self", "arguments")
        return {
            name: value for name, value in arguments.items() if name not in excluded
        }