Source code for maha.expressions.general

""" Regular expression patterns """


# Public names exported by a star-import of this module.
# NOTE(review): "EXPRESSION_LINKS" is exported here, but no assignment to it
# is visible in this listing -- confirm it is defined in the real module.
__all__ = [
    "EXPRESSION_HASHTAGS",
    "EXPRESSION_MENTIONS",
    "EXPRESSION_LINKS",
    "EXPRESSION_EMAILS",
    "EXPRESSION_EMOJIS",
    "EXPRESSION_ALL_SPACES",
    "EXPRESSION_INTEGER",
    "EXPRESSION_DECIMAL",
    "EXPRESSION_SPACE",
    "EXPRESSION_SPACE_OR_NONE",
]

import re

from maha.constants import (
    AND_SIGN,
    ARABIC_COMMA,
    ARABIC_DECIMAL_SEPARATOR,
    ARABIC_NUMBERS,
    ARABIC_THOUSANDS_SEPARATOR,
    AT_SIGN,
    COMMA,
    ENGLISH_NUMBERS,
    HASHTAG,
    PUNCTUATIONS,
    SPACE,
    UNDERSCORE,
)
from maha.rexy import Expression

# Pattern layout:
#   * a look-behind that accepts whitespace, the ``^`` anchor, a newline, or
#     any escaped punctuation mark from PUNCTUATIONS other than AT_SIGN,
#     AND_SIGN and UNDERSCORE;
#   * a captured "#" followed by one or more word characters or hyphens;
#   * a closing word boundary.
EXPRESSION_HASHTAGS = Expression(
    r"(?<=\s|^|\n|"
    + "|".join(
        re.escape(mark)
        for mark in PUNCTUATIONS
        if mark not in (AT_SIGN, AND_SIGN, UNDERSCORE)
    )
    + r")(#[\w-]+)\b"
)
""" Expression that matches hashtags """
# Built from the hashtag pattern by swapping HASHTAG for AT_SIGN.
# NOTE(review): assuming ``.pattern`` is a plain ``str``, ``replace`` touches
# every HASHTAG occurrence in it, not only the one that starts the capture
# group -- confirm that is intended.
EXPRESSION_MENTIONS = Expression(
    EXPRESSION_HASHTAGS.pattern.replace(HASHTAG, AT_SIGN)
)
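""" Expression that matches mentions """

# Adopted from https://gist.github.com/gruber/8891611
# NOTE(review): the EXPRESSION_LINKS assignment that the next docstring
# describes appears to be missing from this listing.
""" Liberal, Accurate Regex Expression for Matching Web URLs """

# Adopted from https://emailregex.com/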
# The adjacent raw-string pieces concatenate into a single capturing group.
EXPRESSION_EMAILS = Expression(
    r"("
    r"[a-zA-Z0-9_.+-]+"  # part before the "@"
    r"@"
    r"[a-zA-Z0-9-]+"  # first label after the "@"
    r"\."  # literal dot
    r"[a-zA-Z0-9-.]+"  # remainder: letters, digits, hyphens and dots
    r")"
)
""" Expression that matches emails """

# Adopted from https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
# TODO: Validate that all of these are valid emojis
# One character class, repeated one or more times.
# NOTE(review): several entries repeat or overlap (U+2702-U+27B0 is listed
# twice, and U+10000-U+10FFFF already contains every U+1Fxxx range). They are
# kept verbatim so the resulting pattern string is unchanged.
EXPRESSION_EMOJIS = Expression(
    "".join(
        (
            "[",
            "\U0001F600-\U0001F64F",  # emoticons
            "\U0001F300-\U0001F5FF",  # symbols & pictographs
            "\U0001F680-\U0001F6FF",  # transport & map symbols
            "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
            "\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
            "\U00002702-\U000027B0",  # dingbats
            "\U00002702-\U000027B0",  # (same range again)
            # NOTE(review): the next two ranges are very broad and include
            # many non-emoji characters (e.g. CJK ideographs).
            "\U000024C2-\U0000FDEF",
            "\U0000FDFE-\U0001F251",
            "\U0001f926-\U0001f937",  # face palm .. shrug
            "\U00010000-\U0010ffff",  # every code point above the BMP
            "\u2640-\u2642",  # female sign .. male sign
            "\u2600-\u2B55",  # misc symbols .. heavy large circle
            "\u200d",  # zero width joiner
            "\u23cf",  # eject symbol
            "\u23e9",  # fast-forward symbol
            "\u231a",  # watch
            "\ufe0f",  # variation selector-16
            "\u3030",  # wavy dash
            "]+",
        )
    )
)
""" Expression that matches emojis """
# A single character class. The pieces are raw strings, so the ``\uXXXX``
# escapes are passed through for the regex engine to interpret.
EXPRESSION_ALL_SPACES = Expression(
    r"["
    r"\u00A0"  # no-break space
    r"\u1680"  # ogham space mark
    r"\u2000-\u200B"  # en quad .. zero width space
    r"\u202F"  # narrow no-break space
    r"\u205F"  # medium mathematical space
    r"\u3000"  # ideographic space
    r"\uFEFF"  # zero width no-break space
    r"]"
)
""" Expression that matches space variations. Normal space is not included. Taken from: https://jkorpela.fi/chars/spaces.html """
# Optional sign, then one or more digits, each optionally followed by a
# separator, then an optional "%".
# A separator is the Arabic thousands separator, the Arabic comma, a comma,
# or whitespace followed by a ``\d`` digit.
EXPRESSION_INTEGER = Expression(
    r"[+-]?(?:[{digits}](?:{separators})?)+%?".format(
        digits="".join(ARABIC_NUMBERS + ENGLISH_NUMBERS),
        separators="|".join(
            (ARABIC_THOUSANDS_SEPARATOR, ARABIC_COMMA, COMMA, r"\s+\d")
        ),
    )
)
""" Expression that matches Arabic and English integers """
# Optional sign, an optional integer part (digits with optional separators),
# a decimal mark ("." or ARABIC_DECIMAL_SEPARATOR), a mandatory fractional
# part built the same way as the integer part, then an optional "%".
EXPRESSION_DECIMAL = Expression(
    (
        r"[+-]?(?:[{digits}](?:{separators})?)*"
        r"[.{decimal_mark}]"
        r"(?:[{digits}](?:{separators})?)+%?"
    ).format(
        digits="".join(ARABIC_NUMBERS + ENGLISH_NUMBERS),
        separators="|".join(
            (ARABIC_THOUSANDS_SEPARATOR, ARABIC_COMMA, COMMA, r"\s+\d")
        ),
        decimal_mark=ARABIC_DECIMAL_SEPARATOR,
    )
)
""" Expression that matches Arabic and English decimals """
# A run of one or more whitespace characters.
EXPRESSION_SPACE = Expression(
    r"\s+"
)
""" Expression that matches at least one whitespace """
# A possibly empty run of whitespace characters.
EXPRESSION_SPACE_OR_NONE = Expression(
    r"\s*"
)
""" Expression that matches zero or more whitespaces """