from __future__ import annotations
# Public names exported by a star-import of this module.
# NOTE(review): ``TWO_THIRDS``, ``combine_patterns`` and ``ValueUnit`` are
# defined in this module but are not listed here - confirm that is intentional.
__all__ = [
    # Pattern-building helpers
    "get_fractions_of_unit_pattern",
    "get_fractions_of_pattern",
    "wrap_pattern",
    "spaced_patterns",
    # Fraction values
    "THIRD",
    "QUARTER",
    "HALF",
    "THREE_QUARTERS",
    # Connectors / separators
    "WAW_CONNECTOR",
    "WORD_SEPARATOR",
    # Common expressions
    "ALL_ALEF",
    "TWO_SUFFIX",
    "SUM_SUFFIX",
    "EXPRESSION_START",
    "EXPRESSION_END",
    "FRACTIONS",
    "TEH_OPTIONAL_SUFFIX",
    # Relative-position and preposition expressions
    "AFTER",
    "BEFORE",
    "PREVIOUS",
    "NEXT",
    "AFTER_NEXT",
    "BEFORE_PREVIOUS",
    "IN_FROM_AT",
    "FROM",
    "TO",
]
from dataclasses import dataclass
from maha.constants import ALEF_VARIATIONS, ARABIC_COMMA, COMMA, LAM, WAW
from maha.expressions import EXPRESSION_SPACE, EXPRESSION_SPACE_OR_NONE
from maha.parsers.templates import Unit, Value
from maha.rexy import (
Expression,
ExpressionGroup,
non_capturing_group,
optional_non_capturing_group,
positive_lookahead,
positive_lookbehind,
)
@dataclass
class ValueUnit:
    """
    A numeric value paired with the unit it is expressed in.

    Attributes
    ----------
    value: float
        The numeric amount.
    unit: Unit
        The unit the amount is measured in.
    """

    value: float
    unit: Unit
def get_fractions_of_unit_pattern(unit: str) -> str:
    """
    Returns the fractions of a unit pattern.

    The result is a non-capturing group of alternatives, in this order:
    ``unit`` followed by ``THREE_QUARTERS``, ``unit`` followed by
    ``TWO_THIRDS``, then ``HALF``, ``THIRD`` and ``QUARTER`` each followed by
    ``unit``. Every pair is joined with :func:`spaced_patterns`.

    Parameters
    ----------
    unit: str
        The unit pattern.

    Returns
    -------
    str
        Pattern for the fractions of the unit.
    """
    # Fractions that are written after the unit.
    after_unit = (THREE_QUARTERS, TWO_THIRDS)
    # Fractions that are written before the unit.
    before_unit = (HALF, THIRD, QUARTER)
    # Alternation order is significant for regex matching, so the two groups
    # are expanded in the same order the alternatives were originally listed.
    return non_capturing_group(
        *(spaced_patterns(unit, fraction) for fraction in after_unit),
        *(spaced_patterns(fraction, unit) for fraction in before_unit),
    )
def get_fractions_of_pattern(pattern: str) -> str:
    """
    Returns the fractions of a pattern.

    The result is a non-capturing group with one alternative per fraction, in
    the order ``THREE_QUARTERS``, ``TWO_THIRDS``, ``HALF``, ``THIRD``,
    ``QUARTER``. Each alternative is ``pattern`` followed (via
    :func:`spaced_patterns`) by an optional ``WAW`` prefix and the fraction.

    Parameters
    ----------
    pattern: str
        The pattern.

    Returns
    -------
    str
        Pattern for the fractions of the input pattern.
    """
    # Optional WAW, itself optionally followed by a space, that may precede
    # the fraction word. Built once instead of once per alternative.
    optional_waw = optional_non_capturing_group(WAW + EXPRESSION_SPACE_OR_NONE)
    # Alternation order is significant for regex matching; keep it fixed.
    fractions = (THREE_QUARTERS, TWO_THIRDS, HALF, THIRD, QUARTER)
    return non_capturing_group(
        *(
            spaced_patterns(pattern, optional_waw + fraction)
            for fraction in fractions
        )
    )
def wrap_pattern(pattern: str) -> str:
    """
    Adds start and end expression to the pattern.

    Parameters
    ----------
    pattern: str
        The pattern to wrap.

    Returns
    -------
    str
        ``pattern`` prefixed with :data:`EXPRESSION_START` and suffixed with
        :data:`EXPRESSION_END`.
    """
    return EXPRESSION_START + pattern + EXPRESSION_END
def spaced_patterns(*patterns) -> str:
    """
    Returns a regex pattern that matches all of the given patterns in the
    order given, with ``EXPRESSION_SPACE`` between consecutive patterns.

    Parameters
    ----------
    patterns
        The patterns to match, in order. Each one is converted with ``str()``
        before joining.

    Returns
    -------
    str
        The joined patterns wrapped in a single non-capturing group.
    """
    space = str(EXPRESSION_SPACE)
    return non_capturing_group(space.join(str(p) for p in patterns))
def combine_patterns(
    *patterns: str | Expression,
    seperator: Expression | None = None,
    combine_all=False,
) -> str:
    """
    Combine the input patterns into one repeated, wrapped pattern.

    The result is a start pattern followed by zero or more repetitions of
    "separator + any of the input patterns", passed through
    :func:`wrap_pattern`.

    Parameters
    ----------
    patterns :
        The patterns to combine.
    seperator :
        The separator placed before every repeated pattern. If None,
        :data:`WORD_SEPARATOR` is used.
    combine_all :
        If True, the start matches any of the input patterns. If False, the
        start matches the first pattern only; the repeated part still accepts
        any of the input patterns, including the first.

    Returns
    -------
    str
        The combined pattern.

    Raises
    ------
    IndexError
        If no patterns are given and ``combine_all`` is false.
    """
    joiner = WORD_SEPARATOR if seperator is None else seperator
    # Alternation over every input pattern; used for the repeated part and,
    # when combine_all is set, for the start as well.
    any_pattern = non_capturing_group(*map(str, patterns))
    head = any_pattern if combine_all else patterns[0]
    repeated = non_capturing_group(joiner + any_pattern)
    return wrap_pattern(head + repeated + "*")
# Fractions
# ELLA is the word that, followed by a fraction, forms the compound fractions
# THREE_QUARTERS and TWO_THIRDS below.
ELLA = Expression("[إا]لا")
# The simple fractions accept an optional leading "ال".
THIRD = Value(1 / 3, optional_non_capturing_group("ال") + "[ثت]ل[ثت]")
""" Pattern that matches the pronunciation of third in Arabic """
QUARTER = Value(1 / 4, optional_non_capturing_group("ال") + "ربع")
""" Pattern that matches the pronunciation of quarter in Arabic """
# The trailing "?" makes the last letter of the word optional.
HALF = Value(1 / 2, optional_non_capturing_group("ال") + "نصف?")
""" Pattern that matches the pronunciation of half in Arabic """
# Compound fractions: ELLA, a space, then the simple fraction pattern.
THREE_QUARTERS = Value(3 / 4, ELLA + EXPRESSION_SPACE + QUARTER)
""" Pattern that matches the pronunciation of three quarters in Arabic """
TWO_THIRDS = Value(2 / 3, ELLA + EXPRESSION_SPACE + THIRD)
""" Pattern that matches the pronunciation of two thirds in Arabic """
# Connectors/Separators
WAW_CONNECTOR = Expression(EXPRESSION_SPACE + WAW + EXPRESSION_SPACE_OR_NONE)
""" Pattern that matches WAW as a connector between two words """
WORD_SEPARATOR = Expression(
    non_capturing_group(
        # Alternative 1: optional space, a comma (COMMA or ARABIC_COMMA), then
        # optionally a space followed by WAW.
        f"{EXPRESSION_SPACE_OR_NONE}{non_capturing_group(COMMA, ARABIC_COMMA)}"
        f"(?:{EXPRESSION_SPACE}{WAW})?",
        # Alternative 2: a space followed by WAW.
        f"{EXPRESSION_SPACE}{WAW}",
    )
    # After the separator: either a word boundary or an optional space.
    + non_capturing_group(r"\b", str(EXPRESSION_SPACE_OR_NONE))
)
""" Pattern that matches the word separator between numerals in Arabic """
# Common expressions
# Character class built from every entry of ALEF_VARIATIONS.
ALL_ALEF = Expression(f'[{"".join(ALEF_VARIATIONS)}]')
""" Pattern that matches all possible forms of the ALEF in Arabic """
TWO_SUFFIX = Expression(non_capturing_group("ين", "ان"))
""" Pattern that matches the two-suffix of words in Arabic """
SUM_SUFFIX = Expression(non_capturing_group("ين", "ون"))
""" Pattern that matches the sum-suffix of words in Arabic """
# Look-behind: an expression may start at the beginning of the text, after a
# non-word character, at a word boundary, or right after a WAW or LAM that
# itself starts a word.
EXPRESSION_START = Expression(
    positive_lookbehind("^", r"\W", r"\b", r"\b" + WAW, r"\b" + LAM)
)
""" Pattern that matches the start of a rule expression in Arabic """
# Look-ahead: an expression may end at the end of the text, before a non-word
# character, or at a word boundary.
EXPRESSION_END = Expression(positive_lookahead("$", r"\W", r"\b"))
""" Pattern that matches the end of a rule expression in Arabic """
# All fraction values in one group; the compound fractions are listed before
# the simple ones they contain.
FRACTIONS = ExpressionGroup(THREE_QUARTERS, TWO_THIRDS, QUARTER, HALF, THIRD)
# Optional single trailing character (either letter of the class), appended to
# PREVIOUS and NEXT below.
TEH_OPTIONAL_SUFFIX = "[ةه]?"
# AFTER and BEFORE share the same optional leading word followed by a space.
AFTER = Expression(
    optional_non_capturing_group("[إا]لل?ي" + EXPRESSION_SPACE) + "بعد"
)
BEFORE = Expression(
    optional_non_capturing_group("[إا]لل?ي" + EXPRESSION_SPACE) + "[أاق]بل"
)
# PREVIOUS and NEXT: one of several alternative words, then the optional suffix.
PREVIOUS = Expression(
    non_capturing_group("الماضي?", "السابق", "المنصرم", "الفا[يئ]ت")
    + TEH_OPTIONAL_SUFFIX
)
NEXT = Expression(
    non_capturing_group("الجاي", "القادم", "التالي?", "ال[اآ]تي?", "المقبل")
    + TEH_OPTIONAL_SUFFIX
)
# Two-word combinations joined with spaced_patterns.
AFTER_NEXT = Expression(spaced_patterns(AFTER, NEXT))
BEFORE_PREVIOUS = Expression(spaced_patterns(BEFORE, PREVIOUS))
# Any one of the listed words.
IN_FROM_AT = Expression(
    non_capturing_group("في", "من", "خلال", "الموافق", "عند", "قراب[ةه]", "على")
)
# Single-word group; "من" is also one of the IN_FROM_AT alternatives.
FROM = Expression(non_capturing_group("من"))
# Optional WAW, optional space, then one of the listed alternatives.
TO = Expression(
    optional_non_capturing_group(WAW)
    + EXPRESSION_SPACE_OR_NONE
    + non_capturing_group(
        "[اإ]لى",
        "حتى",
        "لل?",
    )
)