"""
Functions that operate on a string and check for values contained in it
"""
from __future__ import annotations
__all__ = [
"contains",
"contains_expressions",
"contain_strings",
"contains_repeated_substring",
"contains_single_letter_word",
]
import regex as re
from maha.constants import (
ALL_HARAKAT,
ARABIC,
ARABIC_LETTERS,
ARABIC_LIGATURES,
ARABIC_NUMBERS,
ARABIC_PUNCTUATIONS,
EMPTY,
ENGLISH,
ENGLISH_CAPITAL_LETTERS,
ENGLISH_LETTERS,
ENGLISH_NUMBERS,
ENGLISH_PUNCTUATIONS,
ENGLISH_SMALL_LETTERS,
HARAKAT,
LAM_ALEF,
LAM_ALEF_VARIATIONS,
NUMBERS,
PERSIAN,
PUNCTUATIONS,
SPACE,
TATWEEL,
)
from maha.expressions import (
EXPRESSION_ARABIC_HASHTAGS,
EXPRESSION_ARABIC_MENTIONS,
EXPRESSION_EMAILS,
EXPRESSION_EMOJIS,
EXPRESSION_ENGLISH_HASHTAGS,
EXPRESSION_ENGLISH_MENTIONS,
EXPRESSION_HASHTAGS,
EXPRESSION_LINKS,
EXPRESSION_MENTIONS,
)
from maha.rexy import Expression, ExpressionGroup
from maha.utils import check_positive_integer
def contains(
    text: str,
    arabic: bool = False,
    english: bool = False,
    arabic_letters: bool = False,
    english_letters: bool = False,
    english_small_letters: bool = False,
    english_capital_letters: bool = False,
    numbers: bool = False,
    harakat: bool = False,
    all_harakat: bool = False,
    tatweel: bool = False,
    lam_alef_variations: bool = False,
    lam_alef: bool = False,
    punctuations: bool = False,
    arabic_numbers: bool = False,
    english_numbers: bool = False,
    arabic_punctuations: bool = False,
    english_punctuations: bool = False,
    arabic_ligatures: bool = False,
    persian: bool = False,
    arabic_hashtags: bool = False,
    arabic_mentions: bool = False,
    emails: bool = False,
    english_hashtags: bool = False,
    english_mentions: bool = False,
    hashtags: bool = False,
    links: bool = False,
    mentions: bool = False,
    emojis: bool = False,
    custom_strings: list[str] | str | None = None,
    custom_expressions: ExpressionGroup | Expression | None = None,
    operator: str | None = None,
) -> dict[str, bool] | bool:
    """Check for certain characters, strings or patterns in the given text.

    To add a new parameter, make sure that its name is the same as the
    corresponding constant. For the patterns, only remove the prefix
    ``EXPRESSION_`` from the parameter name.

    Parameters
    ----------
    text : str
        Text to check
    arabic : bool, optional
        Check for :data:`~.ARABIC` characters, by default False
    english : bool, optional
        Check for :data:`~.ENGLISH` characters, by default False
    arabic_letters : bool, optional
        Check for :data:`~.ARABIC_LETTERS` characters, by default False
    english_letters : bool, optional
        Check for :data:`~.ENGLISH_LETTERS` characters, by default False
    english_small_letters : bool, optional
        Check for :data:`~.ENGLISH_SMALL_LETTERS` characters, by default False
    english_capital_letters : bool, optional
        Check for :data:`~.ENGLISH_CAPITAL_LETTERS` characters, by default False
    numbers : bool, optional
        Check for :data:`~.NUMBERS` characters, by default False
    harakat : bool, optional
        Check for :data:`~.HARAKAT` characters, by default False
    all_harakat : bool, optional
        Check for :data:`~.ALL_HARAKAT` characters, by default False
    tatweel : bool, optional
        Check for :data:`~.TATWEEL` character, by default False
    lam_alef_variations : bool, optional
        Check for :data:`~.LAM_ALEF_VARIATIONS` characters, by default False
    lam_alef : bool, optional
        Check for :data:`~.LAM_ALEF` character, by default False
    punctuations : bool, optional
        Check for :data:`~.PUNCTUATIONS` characters, by default False
    arabic_numbers : bool, optional
        Check for :data:`~.ARABIC_NUMBERS` characters, by default False
    english_numbers : bool, optional
        Check for :data:`~.ENGLISH_NUMBERS` characters, by default False
    arabic_punctuations : bool, optional
        Check for :data:`~.ARABIC_PUNCTUATIONS` characters, by default False
    english_punctuations : bool, optional
        Check for :data:`~.ENGLISH_PUNCTUATIONS` characters, by default False
    arabic_ligatures : bool, optional
        Check for :data:`~.ARABIC_LIGATURES` words, by default False
    persian : bool, optional
        Check for :data:`~.PERSIAN` characters, by default False
    arabic_hashtags : bool, optional
        Check for Arabic hashtags using the expression
        :data:`~.EXPRESSION_ARABIC_HASHTAGS`, by default False
    arabic_mentions : bool, optional
        Check for Arabic mentions using the expression
        :data:`~.EXPRESSION_ARABIC_MENTIONS`, by default False
    emails : bool, optional
        Check for emails using the expression :data:`~.EXPRESSION_EMAILS`,
        by default False
    english_hashtags : bool, optional
        Check for English hashtags using the expression
        :data:`~.EXPRESSION_ENGLISH_HASHTAGS`, by default False
    english_mentions : bool, optional
        Check for English mentions using the expression
        :data:`~.EXPRESSION_ENGLISH_MENTIONS`, by default False
    hashtags : bool, optional
        Check for hashtags using the expression :data:`~.EXPRESSION_HASHTAGS`,
        by default False
    links : bool, optional
        Check for links using the expression :data:`~.EXPRESSION_LINKS`,
        by default False
    mentions : bool, optional
        Check for mentions using the expression :data:`~.EXPRESSION_MENTIONS`,
        by default False
    emojis : bool, optional
        Check for emojis using the expression :data:`~.EXPRESSION_EMOJIS`,
        by default False
    custom_strings : Union[List[str], str], optional
        Include any other string(s), by default None
    custom_expressions : Union[:class:`~.ExpressionGroup`, :class:`~.Expression`], optional
        Include any other expressions, by default None
    operator : str, optional
        When multiple arguments are set to True, this operator is used to combine
        the output into a boolean. Takes 'and' or 'or', by default None

    Returns
    -------
    Union[Dict[str, bool], bool]
        * If ``text`` is empty, False is returned without any further checks.
        * If one argument is set to True, a boolean value is returned. True if
          the text contains it, False otherwise.
        * If ``operator`` is set and more than one argument is set to True, a
          boolean value that combines the result with the "and/or" operator is
          returned.
        * If more than one argument is set to True and ``operator`` is not set,
          a dictionary is returned where keys are the True passed arguments and
          the corresponding values are booleans. True if the text contains the
          argument, False otherwise.

    Raises
    ------
    ValueError
        If ``operator`` is neither None, 'and' nor 'or'
    ValueError
        If no argument is set to True

    Examples
    --------

    .. code:: pycon

        >>> from maha.cleaners.functions import contains
        >>> text = "مقاييس أداء النماذج في التعلم الآلي Machine Learning ... 🌺"
        >>> contains(text, english=True, emails=True, emojis=True)
        {'english': True, 'emails': False, 'emojis': True}

    .. code:: pycon

        >>> from maha.cleaners.functions import contains
        >>> text = "قال رسول اللهﷺ إن خير أيامكم يوم الجمعة فأكثروا عليَّ من الصلاة فيه"
        >>> contains(text, english=True)
        False
    """
    # An empty text cannot contain anything; note that this short-circuits
    # before any argument validation takes place.
    if not text:
        return False

    if operator is not None and operator not in ["or", "and"]:
        raise ValueError("`operator` can only take 'and' or 'or'")

    custom_strings = custom_strings or []
    custom_expressions = custom_expressions or ExpressionGroup()

    # Snapshot of the function arguments. A copy is iterated instead of the
    # mapping returned by ``locals()`` itself so that the loop below never
    # walks a dict the interpreter could still refresh with the loop's own
    # variables.
    current_arguments = dict(locals())
    constants = globals()

    output: dict[str, bool] = {}

    # Each flag has the same name as its constant (lower-cased); expressions
    # additionally carry the "EXPRESSION_" prefix. So the module globals can be
    # searched by name for every argument that is exactly ``True``.
    # TODO: Maybe find a good pythonic way to do this
    for arg, value in current_arguments.items():
        if value is not True:
            continue

        const = constants.get(arg.upper())
        if const:
            output[arg] = contain_strings(text, const)
            continue

        # No plain constant with that name; check for an expression.
        expression = constants.get("EXPRESSION_" + arg.upper())
        if expression:
            output[arg] = contains_expressions(text, expression)

    if custom_strings:
        output["custom_strings"] = contain_strings(text, custom_strings)

    if custom_expressions:
        output["custom_expressions"] = contains_expressions(text, custom_expressions)

    if not output:
        raise ValueError("At least one argument should be True")

    # A single check always collapses to a plain boolean.
    if len(output) == 1:
        return next(iter(output.values()))
    if operator == "and":
        return all(output.values())
    if operator == "or":
        return any(output.values())
    return output
def contains_repeated_substring(text: str, min_repeated: int = 3) -> bool:
    """Check for consecutive substrings that are repeated at least ``min_repeated``
    times. For example with the default arguments, the text 'hhhhhh' should return
    True

    Parameters
    ----------
    text : str
        Text to check
    min_repeated : int, optional
        Minimum number of consecutive repeated substring to consider, by default 3

    Returns
    -------
    bool
        True if the input text contains a substring that is consecutively
        repeated at least ``min_repeated`` times, otherwise False

    Raises
    ------
    ValueError
        If non positive integer is passed

    Example
    -------

    .. code:: pycon

        >>> from maha.cleaners.functions import contains_repeated_substring
        >>> text = "كانت اللعبة حللللللللوة جداً"
        >>> contains_repeated_substring(text)
        True
    """
    check_positive_integer(min_repeated, "min_repeated")

    # The group captures the shortest non-empty substring; the back-reference
    # must then follow it at least ``min_repeated - 1`` more times, which adds
    # up to ``min_repeated`` consecutive occurrences in total.
    pattern = rf"(.+?)\1{{{min_repeated - 1},}}"

    return contains_expressions(text, pattern)
def contains_single_letter_word(
    text: str,
    arabic_letters: bool = False,
    english_letters: bool = False,
) -> bool:
    """Check for a single-letter word. For example, "how r u" should return True if
    ``english_letters`` is set to True because it contains two single-letter words,
    "r" and "u".

    Parameters
    ----------
    text : str
        Text to check
    arabic_letters : bool, optional
        Check for all :data:`~.ARABIC_LETTERS`, by default False
    english_letters : bool, optional
        Check for all :data:`~.ENGLISH_LETTERS`, by default False

    Returns
    -------
    bool
        True if the input text contains single-letter word, False otherwise

    Raises
    ------
    ValueError
        If no argument is set to True

    Example
    -------

    .. code:: pycon

        >>> from maha.cleaners.functions import contains_single_letter_word
        >>> text = "cu later my friend, ك"
        >>> contains_single_letter_word(text, arabic_letters=True, english_letters=True)
        True
    """
    # Collect every letter that is allowed to form a single-letter word.
    letters = []
    if arabic_letters:
        letters += ARABIC_LETTERS
    if english_letters:
        letters += ENGLISH_LETTERS

    if not letters:
        raise ValueError("At least one argument should be True")

    # One character from the collected set, enclosed by word boundaries.
    char_class = "".join(letters)
    pattern = rf"\b[{char_class}]\b"

    return contains_expressions(text, pattern)
def contains_expressions(
    text: str, expressions: ExpressionGroup | Expression | str
) -> bool:
    r"""Check for matched strings in the given ``text`` using the input ``expressions``

    .. note::
        Use lookahead/lookbehind when substrings should not be captured or removed.

    Parameters
    ----------
    text : str
        Text to check
    expressions : Union[:class:`~.ExpressionGroup`, :class:`~.Expression`, str]
        Expression(s) to use. A plain string is wrapped in an
        :class:`~.Expression` before searching.

    Returns
    -------
    bool
        True if the pattern is found in the given text, False otherwise. For an
        :class:`~.ExpressionGroup`, True if any of its expressions is found.

    Raises
    ------
    ValueError
        If ``expressions`` are not of type :class:`~.Expression`,
        :class:`~.ExpressionGroup` or str

    Example
    -------

    .. code:: pycon

        >>> from maha.cleaners.functions import contains_expressions
        >>> text = "علم الهندسة (Engineering)"
        >>> contains_expressions(text, r"\([A-Za-z]+\)")
        True
    """
    # A group matches as soon as one of its members matches; members are
    # checked recursively.
    if isinstance(expressions, ExpressionGroup):
        return any(contains_expressions(text, expr) for expr in expressions)

    if isinstance(expressions, Expression):
        return bool(expressions.search(text))

    # Raw pattern strings are promoted to an Expression first.
    if isinstance(expressions, str):
        return bool(Expression(expressions).search(text))

    raise ValueError("'expressions' must be of type Expression, ExpressionGroup or str")
def contain_strings(text: str, strings: list[str] | str) -> bool:
    """Check for the input ``strings`` in the given ``text``

    The strings are escaped before being searched for, so they are matched
    literally and not interpreted as patterns.

    Parameters
    ----------
    text : str
        Text to check
    strings : Union[List[str], str]
        String or list of strings to check for. Any other iterable of strings
        (e.g. a tuple) is treated like a list.

    Returns
    -------
    bool
        True if the input string(s) are found in the text, False otherwise

    Raises
    ------
    ValueError
        If no ``strings`` are provided

    Example
    -------

    .. code:: pycon

        >>> from maha.cleaners.functions import contain_strings
        >>> text = "الله أكبر، الحمد لله رب العالمين"
        >>> contain_strings(text, "الله")
        True
    """
    if not strings:
        raise ValueError("'strings' cannot be empty.")

    if isinstance(strings, str):
        alternatives = re.escape(strings)
    else:
        # Several candidates become one alternation; finding any one of them
        # is enough.
        alternatives = "|".join(re.escape(item) for item in strings)

    expression = Expression(f"({alternatives})")
    return contains_expressions(text, expression)