Source code for maha.parsers.rules.common

from __future__ import annotations

__all__ = [
    "get_fractions_of_unit_pattern",
    "get_fractions_of_pattern",
    "wrap_pattern",
    "spaced_patterns",
    "THIRD",
    "QUARTER",
    "HALF",
    "THREE_QUARTERS",
    "WAW_CONNECTOR",
    "WORD_SEPARATOR",
    "ALL_ALEF",
    "TWO_SUFFIX",
    "SUM_SUFFIX",
    "EXPRESSION_START",
    "EXPRESSION_END",
    "FRACTIONS",
    "TEH_OPTIONAL_SUFFIX",
    "AFTER",
    "BEFORE",
    "PREVIOUS",
    "NEXT",
    "AFTER_NEXT",
    "BEFORE_PREVIOUS",
    "IN_FROM_AT",
    "FROM",
    "TO",
]


from dataclasses import dataclass

from maha.constants import ALEF_VARIATIONS, ARABIC_COMMA, COMMA, LAM, WAW
from maha.expressions import EXPRESSION_SPACE, EXPRESSION_SPACE_OR_NONE
from maha.parsers.templates import Unit, Value
from maha.rexy import (
    Expression,
    ExpressionGroup,
    non_capturing_group,
    optional_non_capturing_group,
    positive_lookahead,
    positive_lookbehind,
)


@dataclass
class ValueUnit:
    """Represents a value with unit."""

    value: float
    unit: Unit


[docs]def get_fractions_of_unit_pattern(unit: str) -> str: """ Returns the fractions of a unit pattern. Parameters ---------- unit: str The unit pattern. Returns ------- str Pattern for the fractions of the unit. """ return non_capturing_group( spaced_patterns(unit, THREE_QUARTERS), spaced_patterns(unit, TWO_THIRDS), spaced_patterns(HALF, unit), spaced_patterns(THIRD, unit), spaced_patterns(QUARTER, unit), )
[docs]def get_fractions_of_pattern(pattern: str) -> str: """ Returns the fractions of a pattern. Parameters ---------- pattern: str The pattern. Returns ------- str Pattern for the fractions of the input pattern. """ return non_capturing_group( spaced_patterns( pattern, optional_non_capturing_group(WAW + EXPRESSION_SPACE_OR_NONE) + THREE_QUARTERS, ), spaced_patterns( pattern, optional_non_capturing_group(WAW + EXPRESSION_SPACE_OR_NONE) + TWO_THIRDS, ), spaced_patterns( pattern, optional_non_capturing_group(WAW + EXPRESSION_SPACE_OR_NONE) + HALF ), spaced_patterns( pattern, optional_non_capturing_group(WAW + EXPRESSION_SPACE_OR_NONE) + THIRD, ), spaced_patterns( pattern, optional_non_capturing_group(WAW + EXPRESSION_SPACE_OR_NONE) + QUARTER, ), )
[docs]def wrap_pattern(pattern: str) -> str: """Adds start and end expression to the pattern.""" return EXPRESSION_START + pattern + EXPRESSION_END
[docs]def spaced_patterns(*patterns) -> str: """ Returns a regex pattern that matches any of the given patterns, separated by spaces. Parameters ---------- patterns The patterns to match. """ return non_capturing_group(str(EXPRESSION_SPACE).join(str(p) for p in patterns))
def combine_patterns( *patterns: str | Expression, seperator: Expression | None = None, combine_all=False, ) -> str: """ Intelligently combine following input patterns. Parameters ---------- patterns : The patterns to combine. seperator : The seperator to use. If None, the default seperator :data:`WORD_SEPARATOR` is used. combine_all : If True, the start matches any of the input patterns. If False, the start matches the first pattern only, followed by any combination of all other patterns including the first pattern. Returns ------- str The combined pattern. """ if seperator is None: seperator = WORD_SEPARATOR start_group = non_capturing_group(*[str(p) for p in patterns]) pattern = wrap_pattern( (start_group if combine_all else patterns[0]) + non_capturing_group(seperator + start_group) + "*" ) return pattern # Fractions ELLA = Expression("[إا]لا")
[docs]THIRD = Value(1 / 3, optional_non_capturing_group("ال") + "[ثت]ل[ثت]")
""" Pattern that matches the pronunciation of third in Arabic """
[docs]QUARTER = Value(1 / 4, optional_non_capturing_group("ال") + "ربع")
""" Pattern that matches the pronunciation of quarter in Arabic """
[docs]HALF = Value(1 / 2, optional_non_capturing_group("ال") + "نصف?")
""" Pattern that matches the pronunciation of half in Arabic """
[docs]THREE_QUARTERS = Value(3 / 4, ELLA + EXPRESSION_SPACE + QUARTER)
""" Pattern that matches the pronunciation of three quarters in Arabic """ TWO_THIRDS = Value(2 / 3, ELLA + EXPRESSION_SPACE + THIRD) """ Pattern that matches the pronunciation of two thirds in Arabic """ # Connectors/Separators
[docs]WAW_CONNECTOR = Expression(EXPRESSION_SPACE + WAW + EXPRESSION_SPACE_OR_NONE)
""" Pattern that matches WAW as a connector between two words """
[docs]WORD_SEPARATOR = Expression( non_capturing_group( f"{EXPRESSION_SPACE_OR_NONE}{non_capturing_group(COMMA, ARABIC_COMMA)}" f"(?:{EXPRESSION_SPACE}{WAW})?", f"{EXPRESSION_SPACE}{WAW}", ) + non_capturing_group(r"\b", str(EXPRESSION_SPACE_OR_NONE)) )
""" Pattern that matches the word separator between numerals in Arabic """ # Common expressions
[docs]ALL_ALEF = Expression(f'[{"".join(ALEF_VARIATIONS)}]')
""" Pattern that matches all possible forms of the ALEF in Arabic """
[docs]TWO_SUFFIX = Expression(non_capturing_group("ين", "ان"))
""" Pattern that matches the two-suffix of words in Arabic """
[docs]SUM_SUFFIX = Expression(non_capturing_group("ين", "ون"))
""" Pattern that matches the sum-suffix of words in Arabic """
[docs]EXPRESSION_START = Expression( positive_lookbehind("^", r"\W", r"\b", r"\b" + WAW, r"\b" + LAM) )
""" Pattern that matches the start of a rule expression in Arabic """
[docs]EXPRESSION_END = Expression(positive_lookahead("$", r"\W", r"\b"))
""" Pattern that matches the end of a rule expression in Arabic """
[docs]FRACTIONS = ExpressionGroup(THREE_QUARTERS, TWO_THIRDS, QUARTER, HALF, THIRD)
[docs]TEH_OPTIONAL_SUFFIX = "[ةه]?"
[docs]AFTER = Expression(optional_non_capturing_group("[إا]لل?ي" + EXPRESSION_SPACE) + "بعد")
[docs]BEFORE = Expression( optional_non_capturing_group("[إا]لل?ي" + EXPRESSION_SPACE) + "[أاق]بل" )
[docs]AFTER_NEXT = Expression(spaced_patterns(AFTER, NEXT))
[docs]BEFORE_PREVIOUS = Expression(spaced_patterns(BEFORE, PREVIOUS))
[docs]IN_FROM_AT = Expression( non_capturing_group("في", "من", "خلال", "الموافق", "عند", "قراب[ةه]", "على") )
[docs]FROM = Expression(non_capturing_group("من"))
[docs]TO = Expression( optional_non_capturing_group(WAW) + EXPRESSION_SPACE_OR_NONE + non_capturing_group( "[اإ]لى", "حتى", "لل?", ) )