Source code for maha.cleaners.functions.replace_fn

"""
Functions that operate on a string and replace specific characters with others.
"""
from __future__ import annotations

# Public names exported by this module (what ``from ... import *`` exposes).
__all__ = [
    "replace",
    "replace_except",
    "replace_pairs",
    "replace_expression",
    "arabic_numbers_to_english",
    "connect_single_letter_word",
]


from typing import Callable

# To enjoy infinite width lookbehind
import regex as re

from maha.constants import (
    ARABIC_LETTERS,
    ARABIC_NUMBERS,
    BEH,
    EMPTY,
    ENGLISH_NUMBERS,
    FEH,
    KAF,
    LAM,
    SPACE,
    TEH,
    WAW,
)
from maha.rexy import Expression, ExpressionGroup


def connect_single_letter_word(
    text: str,
    waw: bool | None = None,
    feh: bool | None = None,
    beh: bool | None = None,
    lam: bool | None = None,
    kaf: bool | None = None,
    teh: bool | None = None,
    all: bool | None = None,
    custom_strings: list[str] | str | None = None,
) -> str:
    """Connects single-letter word with the letter following it.

    A selected letter (or custom string) is connected when it starts at a
    word boundary and is followed by a single whitespace character and at
    least one more character; that whitespace character is removed.

    Parameters
    ----------
    text : str
        Text to process
    waw : bool, optional
        Connect :data:`.WAW` letter, by default None
    feh : bool, optional
        Connect :data:`.FEH` letter, by default None
    beh : bool, optional
        Connect :data:`.BEH` letter, by default None
    lam : bool, optional
        Connect :data:`.LAM` letter, by default None
    kaf : bool, optional
        Connect :data:`.KAF` letter, by default None
    teh : bool, optional
        Connect :data:`.TEH` letter, by default None
    all : bool, optional
        Connect all letter except the ones set to False, by default None
    custom_strings : Union[List[str], str], optional
        Include any other string(s) to connect, by default None.
        Empty strings are ignored.

    Returns
    -------
    str
        Processed text. The input is returned unchanged when no letter or
        custom string is selected.
    """
    if isinstance(custom_strings, str):
        custom_strings = [custom_strings]

    letters = []
    if waw or (all and waw is not False):
        letters.append(WAW)
    if feh or (all and feh is not False):
        letters.append(FEH)
    if beh or (all and beh is not False):
        letters.append(BEH)
    if lam or (all and lam is not False):
        letters.append(LAM)
    if kaf or (all and kaf is not False):
        letters.append(KAF)
    if teh or (all and teh is not False):
        letters.append(TEH)
    if custom_strings:
        # Custom strings are matched literally; an empty string would add an
        # empty alternative that matches at every word boundary, so skip it.
        letters.extend(re.escape(s) for s in custom_strings if s)

    # With nothing selected the alternation would be empty and the pattern
    # would strip the whitespace after *every* word. Leave the text as is.
    if not letters:
        return text

    chars = "|".join(letters)
    return replace_expression(text, rf"(\b)({chars})(?:\s)(?=.)", r"\1\2")
def arabic_numbers_to_english(text: str):
    """Converts Arabic numbers :data:`~.ARABIC_NUMBERS` to the corresponding
    English numbers :data:`~.ENGLISH_NUMBERS`

    The conversion is delegated to :func:`replace_pairs`, pairing each entry
    of :data:`~.ARABIC_NUMBERS` with the entry at the same position in
    :data:`~.ENGLISH_NUMBERS`.

    Parameters
    ----------
    text : str
        Text to process

    Returns
    -------
    str
        Processed text with all occurrences of Arabic numbers converted
        to English numbers

    Examples
    --------

    .. code:: pycon

        >>> from maha.cleaners.functions import arabic_numbers_to_english
        >>> text = "٣"
        >>> arabic_numbers_to_english(text)
        '3'

    .. code:: pycon

        >>> from maha.cleaners.functions import arabic_numbers_to_english
        >>> text = "١٠"
        >>> arabic_numbers_to_english(text)
        '10'
    """
    converted = replace_pairs(text, keys=ARABIC_NUMBERS, values=ENGLISH_NUMBERS)
    return converted
def replace_expression(
    text: str,
    expression: Expression | ExpressionGroup | str,
    with_value: Callable[..., str] | str,
) -> str:
    """Matches characters from the input text using the given ``expression``
    and replaces all matched characters with the given value.

    A plain string is wrapped in an :class:`Expression`; an
    :class:`ExpressionGroup` is joined into a single :class:`Expression`
    before the substitution is performed.

    Parameters
    ----------
    text : str
        Text to process
    expression :
        Pattern/Expression used to match characters from the text
    with_value :
        Value to replace the matched characters with

    Returns
    -------
    str
        Processed text

    Examples
    --------

    .. code:: pycon

        >>> from maha.cleaners.functions import replace_expression
        >>> text = "ولقد حصلت على ١٠ من ١٠ "
        >>> replace_expression(text, "١٠", "عشرة")
        'ولقد حصلت على عشرة من عشرة '

    .. code:: pycon

        >>> from maha.cleaners.functions import replace_expression
        >>> text = "ذهبت الفتاه إلى المدرسه"
        >>> replace_expression(text, "ه( |$)", "ة ").strip()
        'ذهبت الفتاة إلى المدرسة'
    """
    # Normalise the input step by step, in the same order as the type checks:
    # raw string first, then expression group.
    pattern = Expression(expression) if isinstance(expression, str) else expression
    if isinstance(pattern, ExpressionGroup):
        pattern = Expression(pattern.join())

    return pattern.sub(with_value, text)
def replace(text: str, strings: list[str] | str, with_value: str) -> str:
    """Replaces the input ``strings`` in the given text with the given value

    The strings are matched literally (they are escaped before being turned
    into a pattern).

    Parameters
    ----------
    text : str
        Text to process
    strings :
        Strings to replace. An empty list leaves the text unchanged.
    with_value :
        Value to replace the input strings with

    Returns
    -------
    str
        Processed text

    Examples
    --------

    .. code:: pycon

        >>> from maha.cleaners.functions import replace
        >>> text = "حصل الولد على معدل 50%"
        >>> replace(text, "%", " بالمئة")
        'حصل الولد على معدل 50 بالمئة'

    .. code:: pycon

        >>> from maha.cleaners.functions import replace
        >>> text = "ولقد كلف هذا المنتج 100 $"
        >>> replace(text, "$", "دولار")
        'ولقد كلف هذا المنتج 100 دولار'
    """
    if isinstance(strings, list):
        # An empty list would produce the pattern "()", which matches the
        # empty string at every position and would inject ``with_value``
        # between all characters. Nothing to replace means nothing changes.
        if not strings:
            return text
        pattern = "|".join(str(re.escape(c)) for c in strings)
    else:
        pattern = str(re.escape(strings))

    return replace_expression(text, f"({pattern})", with_value)
def replace_except(text: str, strings: list[str] | str, with_value: str) -> str:
    """Replaces everything except the input ``strings`` in the given text
    with the given value

    Parameters
    ----------
    text : str
        Text to process
    strings :
        Strings to preserve (not replace)
    with_value :
        Value to replace all other strings with.

    Returns
    -------
    str
        Processed text

    Example
    -------

    .. code:: pycon

        >>> from maha.cleaners.functions import replace_except
        >>> from maha.constants import ARABIC_LETTERS, SPACE, EMPTY
        >>> text = "لَيتَ الذينَ تُحبُّ العيّنَ رؤيَتهم"
        >>> replace_except(text, ARABIC_LETTERS + [SPACE], EMPTY)
        'ليت الذين تحب العين رؤيتهم'
    """
    # Build a literal alternation of everything that must be kept.
    if isinstance(strings, list):
        preserved = "|".join(str(re.escape(item)) for item in strings)
    else:
        preserved = str(re.escape(strings))

    # Group 1 lazily captures the text in front of the next preserved string;
    # the extra "$" alternative lets it also capture trailing text at the end.
    pattern = f"(.*?)({preserved}|$)"

    def _substitute(match):
        unwanted, kept = match.groups()[0], match.groups()[1]
        if unwanted:
            # Something preceded the preserved string: swap it for with_value.
            return with_value + kept
        return kept

    return replace_expression(text, pattern, _substitute)
def replace_pairs(text: str, keys: list[str], values: list[str]) -> str:
    """Replaces each key with its corresponding value in the given text

    Keys are matched literally (they are escaped before being turned into a
    pattern). If a key appears more than once in ``keys``, the value paired
    with its first occurrence is used.

    Parameters
    ----------
    text : str
        Text to process
    keys :
        Strings to be replaced
    values :
        Strings to be replaced with

    Returns
    -------
    str
        Processed text. The input is returned unchanged when ``keys`` is
        empty.

    Raises
    ------
    ValueError
        If keys and values are of different lengths

    Example
    -------

    .. code:: pycon

        >>> from maha.cleaners.functions import replace_pairs
        >>> text = 'شلونك يا محمد؟'
        >>> replace_pairs(text, ['شلونك'] , ['كيف حالك'])
        'كيف حالك يا محمد؟'
    """
    if len(keys) != len(values):
        raise ValueError("'keys' and 'values' should have the same length")

    # No pairs: the joined pattern would be empty, match the empty string,
    # and the lookup of "" below would fail. Nothing to replace.
    if not keys:
        return text

    # Constant-time lookup per match; setdefault keeps the first value for a
    # duplicated key, mirroring what ``keys.index`` would have selected.
    lookup: dict[str, str] = {}
    for key, value in zip(keys, values):
        lookup.setdefault(key, value)

    pattern = "|".join(str(re.escape(c)) for c in keys)

    def func(match):
        return lookup[match.group(0)]

    return replace_expression(text, pattern, func)