maha.processors.base_processor#
The base for all processors
Module Contents#
Classes#
Base class for all processors. It contains almost all functions needed for the processors.
- class BaseProcessor[source]#
Bases: abc.ABC
Base class for all processors. It contains almost all functions needed for the processors.
- Parameters
text (Union[List[str], str]) – A text or list of strings to process
- abstract get_lines(self, n_lines=100)[source]#
Returns a generator of lists of strings, each of length n_lines
- Parameters
n_lines (int) – Number of lines to yield, Defaults to 100
- Yields
List[str] – List of strings with length of n_lines. The last list may be of length less than n_lines.
- abstract apply(self, fn)[source]#
Applies a function to each line
- Parameters
fn (Callable[[str], str]) – Function to apply
- abstract filter(self, fn)[source]#
Keeps lines for which the input function is True
- Parameters
fn (Callable[[str], bool]) – Function to check
- get(self, unique_characters=False, character_length=False, word_length=False)[source]#
Returns statistics about the provided text
- Parameters
unique_characters (bool, optional) – Return all unique characters, by default False
character_length (bool, optional) – Return the character length of each string, by default False
word_length (bool, optional) – Return the word length of each string (split by space), by default False
- Returns
If one argument is set to True, its value is returned
- If more than one argument is set to True, a dictionary is returned where
the keys are the names of the arguments set to True, with the corresponding values
- Return type
Union[Dict[str, Any], Any]
- keep(self, arabic=False, english=False, arabic_letters=False, english_letters=False, english_small_letters=False, english_capital_letters=False, numbers=False, harakat=False, all_harakat=False, punctuations=False, arabic_numbers=False, english_numbers=False, arabic_punctuations=False, english_punctuations=False, use_space=True, custom_strings=None)[source]#
Applies keep() to each line
- Parameters
arabic (bool) –
english (bool) –
arabic_letters (bool) –
english_letters (bool) –
english_small_letters (bool) –
english_capital_letters (bool) –
numbers (bool) –
harakat (bool) –
all_harakat (bool) –
punctuations (bool) –
arabic_numbers (bool) –
english_numbers (bool) –
arabic_punctuations (bool) –
english_punctuations (bool) –
use_space (bool) –
custom_strings (list[str] | str | None) –
- normalize(self, lam_alef=None, alef=None, waw=None, yeh=None, teh_marbuta=None, ligatures=None, spaces=None, all=None)[source]#
Applies normalize() to each line
- Parameters
lam_alef (bool | None) –
alef (bool | None) –
waw (bool | None) –
yeh (bool | None) –
teh_marbuta (bool | None) –
ligatures (bool | None) –
spaces (bool | None) –
all (bool | None) –
- connect_single_letter_word(self, waw=None, feh=None, beh=None, lam=None, kaf=None, teh=None, all=None, custom_strings=None)[source]#
Applies connect_single_letter_word() to each line
- Parameters
waw (bool | None) –
feh (bool | None) –
beh (bool | None) –
lam (bool | None) –
kaf (bool | None) –
teh (bool | None) –
all (bool | None) –
custom_strings (list[str] | str | None) –
- replace(self, strings, with_value)[source]#
Applies replace() to each line
- Parameters
strings (list[str] | str) –
with_value (str) –
- replace_expression(self, expression, with_value)[source]#
Applies replace_expression() to each line
- Parameters
expression (Expression | ExpressionGroup | str) –
with_value (Callable[..., str] | str) –
- replace_pairs(self, keys, values)[source]#
Applies replace_pairs() to each line
- Parameters
keys (list[str]) –
values (list[str]) –
- reduce_repeated_substring(self, min_repeated=3, reduce_to=2)[source]#
Applies reduce_repeated_substring() to each line
- Parameters
min_repeated (int) –
reduce_to (int) –
- remove(self, arabic=False, english=False, arabic_letters=False, english_letters=False, english_small_letters=False, english_capital_letters=False, numbers=False, harakat=False, all_harakat=False, tatweel=False, punctuations=False, arabic_numbers=False, english_numbers=False, arabic_punctuations=False, english_punctuations=False, arabic_ligatures=False, arabic_hashtags=False, arabic_mentions=False, emails=False, english_hashtags=False, english_mentions=False, hashtags=False, links=False, mentions=False, emojis=False, use_space=True, custom_strings=None, custom_expressions=None)[source]#
Applies remove() to each line
- Parameters
arabic (bool) –
english (bool) –
arabic_letters (bool) –
english_letters (bool) –
english_small_letters (bool) –
english_capital_letters (bool) –
numbers (bool) –
harakat (bool) –
all_harakat (bool) –
tatweel (bool) –
punctuations (bool) –
arabic_numbers (bool) –
english_numbers (bool) –
arabic_punctuations (bool) –
english_punctuations (bool) –
arabic_ligatures (bool) –
arabic_hashtags (bool) –
arabic_mentions (bool) –
emails (bool) –
english_hashtags (bool) –
english_mentions (bool) –
hashtags (bool) –
links (bool) –
mentions (bool) –
emojis (bool) –
use_space (bool) –
custom_strings (list[str] | str | None) –
custom_expressions (list[str] | str | None) –
- drop_lines_contain(self, arabic=False, english=False, arabic_letters=False, english_letters=False, english_small_letters=False, english_capital_letters=False, numbers=False, harakat=False, all_harakat=False, tatweel=False, lam_alef_variations=False, lam_alef=False, punctuations=False, arabic_numbers=False, english_numbers=False, arabic_punctuations=False, english_punctuations=False, arabic_ligatures=False, persian=False, arabic_hashtags=False, arabic_mentions=False, emails=False, english_hashtags=False, english_mentions=False, hashtags=False, links=False, mentions=False, emojis=False, custom_strings=None, custom_expressions=None, operator='or')[source]#
Drop lines that contain any of the selected strings or patterns.
Note
Use operator='and' to drop lines that contain all selected strings or patterns.
See contains() for arguments description
- Parameters
arabic (bool) –
english (bool) –
arabic_letters (bool) –
english_letters (bool) –
english_small_letters (bool) –
english_capital_letters (bool) –
numbers (bool) –
harakat (bool) –
all_harakat (bool) –
tatweel (bool) –
lam_alef_variations (bool) –
lam_alef (bool) –
punctuations (bool) –
arabic_numbers (bool) –
english_numbers (bool) –
arabic_punctuations (bool) –
english_punctuations (bool) –
arabic_ligatures (bool) –
persian (bool) –
arabic_hashtags (bool) –
arabic_mentions (bool) –
emails (bool) –
english_hashtags (bool) –
english_mentions (bool) –
hashtags (bool) –
links (bool) –
mentions (bool) –
emojis (bool) –
custom_strings (list[str] | str | None) –
custom_expressions (list[str] | str | None) –
operator (str) –
- drop_lines_below_len(self, length, word_level=False)[source]#
Drop lines with a number of characters/words less than the input length
- Parameters
length (int) – Number of characters/words
word_level (bool, optional) – True to switch to word level, which splits the text by space, by default False
- drop_lines_above_len(self, length, word_level=False)[source]#
Drop lines with a number of characters/words more than the input length
- Parameters
length (int) – Number of characters/words
word_level (bool, optional) – True to switch to word level, which splits the text by space, by default False
- drop_lines_contain_repeated_substring(self, repeated=3)[source]#
Drop lines containing a number of consecutive repeated substrings
- Parameters
repeated (int, optional) – Minimum number of repetitions, by default 3
- drop_lines_contain_single_letter_word(self, arabic_letters=False, english_letters=False)[source]#
Drop lines containing a single-letter word (e.g. “محمد و احمد” or “how r u”). In Arabic, single-letter words are rare.
Warning
In English, all lines containing the letter “I” will be dropped since it is considered a single-letter word
See contains_single_letter_word(). See also connect_single_letter_word().
- Parameters
arabic_letters (bool) –
english_letters (bool) –
- filter_lines_contain(self, arabic=False, english=False, arabic_letters=False, english_letters=False, english_small_letters=False, english_capital_letters=False, numbers=False, harakat=False, all_harakat=False, tatweel=False, lam_alef_variations=False, lam_alef=False, punctuations=False, arabic_numbers=False, english_numbers=False, arabic_punctuations=False, english_punctuations=False, arabic_ligatures=False, persian=False, arabic_hashtags=False, arabic_mentions=False, emails=False, english_hashtags=False, english_mentions=False, hashtags=False, links=False, mentions=False, emojis=False, custom_strings=None, custom_expressions=None, operator='or')[source]#
Keep lines that contain any of the selected strings or patterns.
Note
Use operator='and' to keep lines that contain all selected strings or patterns.
See contains() for arguments description
- Parameters
arabic (bool) –
english (bool) –
arabic_letters (bool) –
english_letters (bool) –
english_small_letters (bool) –
english_capital_letters (bool) –
numbers (bool) –
harakat (bool) –
all_harakat (bool) –
tatweel (bool) –
lam_alef_variations (bool) –
lam_alef (bool) –
punctuations (bool) –
arabic_numbers (bool) –
english_numbers (bool) –
arabic_punctuations (bool) –
english_punctuations (bool) –
arabic_ligatures (bool) –
persian (bool) –
arabic_hashtags (bool) –
arabic_mentions (bool) –
emails (bool) –
english_hashtags (bool) –
english_mentions (bool) –
hashtags (bool) –
links (bool) –
mentions (bool) –
emojis (bool) –
custom_strings (list[str] | str | None) –
custom_expressions (list[str] | str | None) –
operator (str) –