# Source code for maha.cleaners.functions.normalize_fn

"""
Special functions that convert similar characters into one common character
(Characters that roughly have the same shape)
"""
from __future__ import annotations

# Public names exported by this module (used by ``from ... import *``).
__all__ = ["normalize", "normalize_lam_alef", "normalize_small_alef"]


import maha.cleaners.functions as functions
from maha.constants import (
    ALEF,
    ALEF_MADDA_ABOVE,
    ALEF_SUPERSCRIPT,
    ALEF_VARIATIONS,
    ARABIC_LIGATURES,
    ARABIC_LIGATURES_NORMALIZED,
    EMPTY,
    HEH,
    LAM,
    LAM_ALEF_VARIATIONS,
    LAM_ALEF_VARIATIONS_NORMALIZED,
    MADDAH_ABOVE,
    SPACE,
    TEH_MARBUTA,
    WAW,
    WAW_VARIATIONS,
    YEH,
    YEH_VARIATIONS,
)
from maha.expressions import EXPRESSION_ALL_SPACES


def normalize(
    text: str,
    lam_alef: bool | None = None,
    alef: bool | None = None,
    waw: bool | None = None,
    yeh: bool | None = None,
    teh_marbuta: bool | None = None,
    ligatures: bool | None = None,
    spaces: bool | None = None,
    all: bool = False,
) -> str:
    """Normalizes characters in the given text

    Each normalization runs when its argument is True, or when ``all`` is True and
    the argument was not explicitly set to False. The normalizations are applied in
    the order the parameters are listed below.

    Parameters
    ----------
    text : str
        Text to process
    lam_alef : bool, optional
        Normalize :data:`~.LAM_ALEF_VARIATIONS` characters to :data:`~.LAM` and
        :data:`~.ALEF`, by default None
    alef : bool, optional
        Normalize :data:`~.ALEF_VARIATIONS` characters to :data:`~.ALEF`,
        by default None
    waw : bool, optional
        Normalize :data:`~.WAW_VARIATIONS` characters to :data:`~.WAW`,
        by default None
    yeh : bool, optional
        Normalize :data:`~.YEH_VARIATIONS` characters to :data:`~.YEH`,
        by default None
    teh_marbuta : bool, optional
        Normalize :data:`~.TEH_MARBUTA` characters to :data:`~.HEH`, by default None
    ligatures : bool, optional
        Normalize :data:`~.ARABIC_LIGATURES` characters to the corresponding indices
        in :data:`~.ARABIC_LIGATURES_NORMALIZED`, by default None
    spaces : bool, optional
        Normalize space variations using the expression :data:`~.EXPRESSION_ALL_SPACES`,
        by default None
    all : bool, optional
        Do all normalization except the ones that are set to False, by default False

    Returns
    -------
    str
        Processed text. :data:`~.EMPTY` is returned when ``text`` is empty.

    Raises
    ------
    ValueError
        If ``text`` is not empty and no argument is set to True

    Examples
    --------
    .. code:: pycon

        >>> from maha.cleaners.functions import normalize
        >>> text = "عن أبي هريرة"
        >>> normalize(text, alef=True, teh_marbuta=True)
        'عن ابي هريره'

    .. code:: pycon

        >>> from maha.cleaners.functions import normalize
        >>> text = "قال رسول الله ﷺ"
        >>> normalize(text, ligatures=True)
        'قال رسول الله صلى الله عليه وسلم'

    .. code:: pycon

        >>> from maha.cleaners.functions import normalize
        >>> text = "قال مؤمن: ﷽ قل هو ﷲ أحد"
        ... # For space
        >>> normalize(text, all=True, waw=False)
        'قال مؤمن: بسم الله الرحمن الرحيم قل هو الله احد'
    """
    # Empty input short-circuits before the argument validation below.
    if not text:
        return EMPTY

    if not (
        lam_alef or alef or waw or yeh or teh_marbuta or ligatures or spaces or all
    ):
        raise ValueError("At least one argument should be True")

    def _enabled(flag):
        # Requested explicitly, or covered by ``all`` unless explicitly disabled.
        # ``None`` means "not specified", so only a literal False opts out of ``all``.
        return bool(flag or (all and flag is not False))

    output = text
    if _enabled(lam_alef):
        output = functions.replace(output, LAM_ALEF_VARIATIONS, LAM + ALEF)
    if _enabled(alef):
        output = functions.replace(output, ALEF_VARIATIONS, ALEF)
    if _enabled(waw):
        output = functions.replace(output, WAW_VARIATIONS, WAW)
    if _enabled(yeh):
        output = functions.replace(output, YEH_VARIATIONS, YEH)
    if _enabled(teh_marbuta):
        output = functions.replace(output, TEH_MARBUTA, HEH)
    if _enabled(ligatures):
        output = functions.replace_pairs(
            output, ARABIC_LIGATURES, ARABIC_LIGATURES_NORMALIZED
        )
    if _enabled(spaces):
        output = functions.replace_expression(output, EXPRESSION_ALL_SPACES, SPACE)

    return output


def normalize_lam_alef(text: str, keep_hamza: bool = True) -> str:
    """Normalize :data:`~.LAM_ALEF_VARIATIONS` to
    :data:`~.LAM_ALEF_VARIATIONS_NORMALIZED` if ``keep_hamza`` is True. Otherwise,
    normalize to :data:`~.LAM` and :data:`~.ALEF`

    Parameters
    ----------
    text : str
        Text to process
    keep_hamza : bool, optional
        True to preserve hamza and madda characters, by default True

    Returns
    -------
    str
        Normalized text

    Examples
    --------
    .. code:: pycon

        >>> from maha.cleaners.functions import normalize_lam_alef
        >>> text = "السﻻم عليكم أحبتي، قالوا في صِفَةِ رَسُولِ الله يتَﻷلأ وَجْهُه"
        >>> normalize_lam_alef(text)
        'السلام عليكم أحبتي، قالوا في صِفَةِ رَسُولِ الله يتَلألأ وَجْهُه'

    .. code:: pycon

        >>> from maha.cleaners.functions import normalize_lam_alef
        >>> text = "اﻵن يا أصحابي"
        >>> normalize_lam_alef(text, keep_hamza=False)
        'الان يا أصحابي'
    """
    if keep_hamza:
        # Each variation is mapped to its own counterpart in the normalized list.
        return functions.replace_pairs(
            text, LAM_ALEF_VARIATIONS, LAM_ALEF_VARIATIONS_NORMALIZED
        )

    # Every variation collapses to the plain LAM + ALEF sequence.
    return functions.replace(text, LAM_ALEF_VARIATIONS, LAM + ALEF)


def normalize_small_alef(
    text: str, keep_madda: bool = True, normalize_end: bool = False
) -> str:
    """Normalize :data:`~.ALEF_SUPERSCRIPT` to :data:`~.ALEF`. If ``keep_madda`` is True
    and :data:`~.ALEF_SUPERSCRIPT` is followed by :data:`~.MADDAH_ABOVE`, then normalize
    to :data:`~.ALEF_MADDA_ABOVE`

    Parameters
    ----------
    text : str
        Text to process
    keep_madda : bool, optional
        True to preserve madda character, by default True
    normalize_end : bool, optional
        True to normalize :data:`~.ALEF_SUPERSCRIPT` that appear at the end of a word,
        by default False

    Returns
    -------
    str
        Normalized text

    Examples
    --------
    .. code:: pycon

        >>> from maha.cleaners.functions import normalize_small_alef
        >>> text = "وَٱلصَّٰٓفَّٰتِ صَفّٗا"
        >>> normalize_small_alef(text)
        'وَٱلصَّآفَّاتِ صَفّٗا'
    """
    output = text
    if keep_madda:
        # Handle the superscript-alef + maddah pair first, so the generic
        # replacement below does not split it.
        output = functions.replace_pairs(
            output, [ALEF_SUPERSCRIPT + MADDAH_ABOVE], [ALEF_MADDA_ABOVE]
        )

    if normalize_end:
        return functions.replace(output, ALEF_SUPERSCRIPT, ALEF)

    # The negative lookahead leaves a superscript alef untouched when it is
    # directly followed by whitespace or by the ``$`` anchor.
    return functions.replace_expression(
        output, rf"{ALEF_SUPERSCRIPT}(?!\s|$)", ALEF
    )


# def normalize_hamza(text: str):
#     ? Should this method be implemented?
#     * This method normalizes [HAMZA_WAW, HAMZA_YA, HAMZA] to HAMZA
#     raise NotImplementedError()