Source code for maha.parsers.functions.parse_fn

"""Functions that extracts values from text"""

from __future__ import annotations

__all__ = ["parse", "parse_expression"]

from maha.constants import (
    ALL_HARAKAT,
    ARABIC,
    ARABIC_LETTERS,
    ARABIC_LIGATURES,
    ARABIC_NUMBERS,
    ARABIC_PUNCTUATIONS,
    EMPTY,
    ENGLISH,
    ENGLISH_CAPITAL_LETTERS,
    ENGLISH_LETTERS,
    ENGLISH_NUMBERS,
    ENGLISH_PUNCTUATIONS,
    ENGLISH_SMALL_LETTERS,
    HARAKAT,
    NUMBERS,
    PUNCTUATIONS,
    SPACE,
    TATWEEL,
)
from maha.expressions import (
    EXPRESSION_ARABIC_HASHTAGS,
    EXPRESSION_ARABIC_MENTIONS,
    EXPRESSION_EMAILS,
    EXPRESSION_EMOJIS,
    EXPRESSION_ENGLISH_HASHTAGS,
    EXPRESSION_ENGLISH_MENTIONS,
    EXPRESSION_HASHTAGS,
    EXPRESSION_LINKS,
    EXPRESSION_MENTIONS,
)
from maha.parsers.templates import Dimension, DimensionType, TextExpression
from maha.rexy import Expression, ExpressionGroup


def parse(
    text: str,
    arabic: bool = False,
    english: bool = False,
    arabic_letters: bool = False,
    english_letters: bool = False,
    english_small_letters: bool = False,
    english_capital_letters: bool = False,
    numbers: bool = False,
    harakat: bool = False,
    all_harakat: bool = False,
    tatweel: bool = False,
    punctuations: bool = False,
    arabic_numbers: bool = False,
    english_numbers: bool = False,
    arabic_punctuations: bool = False,
    english_punctuations: bool = False,
    arabic_ligatures: bool = False,
    arabic_hashtags: bool = False,
    arabic_mentions: bool = False,
    emails: bool = False,
    english_hashtags: bool = False,
    english_mentions: bool = False,
    hashtags: bool = False,
    links: bool = False,
    mentions: bool = False,
    emojis: bool = False,
    custom_expressions: ExpressionGroup | Expression | None = None,
    include_space: bool = False,
) -> list[Dimension]:
    """Extracts certain characters/patterns from the given text.

    Every boolean argument is resolved by name against the module globals:
    the upper-cased argument name is looked up first as a character constant
    and then, prefixed with ``EXPRESSION_``, as a predefined expression.
    To add a new parameter, make sure that its name is the same as the
    corresponding constant. For the patterns, only remove the prefix
    ``EXPRESSION_`` from the parameter name.

    .. admonition:: TO DO

        Add the ability to combine all expressions before parsing.

    Parameters
    ----------
    text : str
        Text to be processed
    arabic : bool, optional
        Extract :data:`~.ARABIC` characters, by default False
    english : bool, optional
        Extract :data:`~.ENGLISH` characters, by default False
    arabic_letters : bool, optional
        Extract :data:`~.ARABIC_LETTERS` characters, by default False
    english_letters : bool, optional
        Extract :data:`~.ENGLISH_LETTERS` characters, by default False
    english_small_letters : bool, optional
        Extract :data:`~.ENGLISH_SMALL_LETTERS` characters, by default False
    english_capital_letters : bool, optional
        Extract :data:`~.ENGLISH_CAPITAL_LETTERS` characters, by default False
    numbers : bool, optional
        Extract :data:`~.NUMBERS` characters, by default False
    harakat : bool, optional
        Extract :data:`~.HARAKAT` characters, by default False
    all_harakat : bool, optional
        Extract :data:`~.ALL_HARAKAT` characters, by default False
    tatweel : bool, optional
        Extract :data:`~.TATWEEL` character, by default False
    punctuations : bool, optional
        Extract :data:`~.PUNCTUATIONS` characters, by default False
    arabic_numbers : bool, optional
        Extract :data:`~.ARABIC_NUMBERS` characters, by default False
    english_numbers : bool, optional
        Extract :data:`~.ENGLISH_NUMBERS` characters, by default False
    arabic_punctuations : bool, optional
        Extract :data:`~.ARABIC_PUNCTUATIONS` characters, by default False
    english_punctuations : bool, optional
        Extract :data:`~.ENGLISH_PUNCTUATIONS` characters, by default False
    arabic_ligatures : bool, optional
        Extract :data:`~.ARABIC_LIGATURES` words, by default False
    arabic_hashtags : bool, optional
        Extract Arabic hashtags using the expression
        :data:`~.EXPRESSION_ARABIC_HASHTAGS`, by default False
    arabic_mentions : bool, optional
        Extract Arabic mentions using the expression
        :data:`~.EXPRESSION_ARABIC_MENTIONS`, by default False
    emails : bool, optional
        Extract emails using the expression
        :data:`~.EXPRESSION_EMAILS`, by default False
    english_hashtags : bool, optional
        Extract English hashtags using the expression
        :data:`~.EXPRESSION_ENGLISH_HASHTAGS`, by default False
    english_mentions : bool, optional
        Extract English mentions using the expression
        :data:`~.EXPRESSION_ENGLISH_MENTIONS`, by default False
    hashtags : bool, optional
        Extract hashtags using the expression
        :data:`~.EXPRESSION_HASHTAGS`, by default False
    links : bool, optional
        Extract links using the expression
        :data:`~.EXPRESSION_LINKS`, by default False
    mentions : bool, optional
        Extract mentions using the expression
        :data:`~.EXPRESSION_MENTIONS`, by default False
    emojis : bool, optional
        Extract emojis using the expression
        :data:`~.EXPRESSION_EMOJIS`, by default False
    custom_expressions : Union[:class:`~.ExpressionGroup`, :class:`~.Expression`], optional
        Additional expression(s) handed to :func:`parse_expression`,
        by default None
    include_space : bool, optional
        Let a character match also span the whitespace that follows each
        matched character. Only affects the character-constant arguments,
        by default False

    Returns
    -------
    List[:class:`~.Dimension`]
        List of dimensions extracted from the text. The matches of each
        enabled argument are appended in turn, followed by the matches of
        ``custom_expressions``. An empty list is returned for empty ``text``.

    Raises
    ------
    ValueError
        If ``text`` is not empty and neither an extraction argument is set to
        True nor ``custom_expressions`` is given
    """
    # Snapshot the call arguments before any other local name is bound.
    # Copying detaches the mapping from the live frame, so the loop below never
    # iterates a dictionary that the interpreter may refresh with new locals.
    current_arguments = dict(locals())

    if not text:
        return []

    # Local import keeps this block self-contained; only needed for escaping.
    import re

    constants = globals()
    output: list[Dimension] = []
    any_argument_set = False

    # Since each argument has the same name as the corresponding constant
    # (expressions additionally carry the "EXPRESSION_" prefix), the enabled
    # arguments can be mapped to their constants by name.
    for arg, value in current_arguments.items():
        # Only arguments explicitly set to True select something to extract;
        # this also skips ``text``, ``custom_expressions`` and unset flags.
        if value is not True:
            continue

        name = arg.upper()

        const = constants.get(name)
        if const:
            any_argument_set = True
            # Escape every member so that regex metacharacters in a constant
            # (such as "]", "\\", "^" or "-") are matched literally instead
            # of changing the meaning of the character class.
            char_class = "".join(re.escape(char) for char in const)
            if include_space:
                pattern = f"(?:[{char_class}](?:\\s+)?)+"
            else:
                pattern = f"[{char_class}]+"
            text_exp = TextExpression(pattern)
            output.extend(parse_expression(text, text_exp, DimensionType[name]))
            continue

        # Not a character constant: check for a predefined expression.
        expression: Expression | None = constants.get("EXPRESSION_" + name)
        if expression:
            any_argument_set = True
            text_exp = TextExpression(str(expression))
            output.extend(parse_expression(text, text_exp, DimensionType[name]))

    if custom_expressions:
        any_argument_set = True
        output.extend(parse_expression(text, custom_expressions))

    if not any_argument_set:
        raise ValueError("At least one argument should be True")

    return output
def parse_expression(
    text: str,
    expressions: ExpressionGroup | Expression,
    dimension_type: DimensionType = DimensionType.GENERAL,
) -> list[Dimension]:
    """
    Collect every match of ``expressions`` found in ``text`` as dimensions

    Parameters
    ----------
    text : str
        Text to search in
    expressions : Union[:class:`~.ExpressionGroup`, :class:`~.Expression`]
        Expression(s) to match against ``text``; a single expression is
        wrapped in an :class:`~.ExpressionGroup` before parsing
    dimension_type : DimensionType
        Dimension type assigned to every extracted dimension,
        by default :attr:`.DimensionType.GENERAL`

    Returns
    -------
    List[:class:`~.Dimension`]
        One dimension per parse result, holding the matching expression, the
        matched slice of ``text``, the parsed value and the start/end offsets

    Raises
    ------
    ValueError
        If ``expressions`` is falsy, is an :class:`~.Expression` without a
        pattern, or is an :class:`~.ExpressionGroup` without expressions
    """
    empty_message = "'expressions' cannot be empty."

    # Guard clauses: reject anything that carries no usable pattern.
    if not expressions:
        raise ValueError(empty_message)
    if isinstance(expressions, Expression) and not expressions.pattern:
        raise ValueError(empty_message)
    if isinstance(expressions, ExpressionGroup) and not expressions.expressions:
        raise ValueError(empty_message)

    # Parsing is always driven through a group, so wrap a lone expression.
    group = (
        ExpressionGroup(expressions)
        if isinstance(expressions, Expression)
        else expressions
    )

    return [
        Dimension(
            match.expression,
            text[match.start : match.end],
            match.value,
            match.start,
            match.end,
            dimension_type,
        )
        for match in group.parse(text)
    ]