Source code for quantlaw.de_extract.statutes_parse_patterns

from regex import regex


def generate_sgb_dict():
    """
    Build the lookup table for the different ways the SGB books are cited.

    Every one of the twelve books is registered under eight lowercase spellings:
    three based on its ordinal stem (e.g. ``"erst buch"``,
    ``"erst buch sozialgesetzbuch"``, ``"erst buch d sozialgesetzbuch"``), two
    based on its roman numeral (``"sgb i"``, ``"sgb-i"``) and three based on its
    arabic number (``"sgb 1"``, ``"sgb-1"``, ``"1. buch sozialgesetzbuch"``).

    Returns:
        dict: Maps each spelling to the key of the SGB book. The value is the
        string ``"SGB-<nr>"``, except for books 9 and 10, where it is the tuple
        ``("SGB-<ROMAN>", "SGB-<nr>")``.
    """
    # Ordinal stems and roman numerals of books 1 to 12, in book order.
    ordinal_stems = (
        "erst",
        "zweit",
        "dritt",
        "viert",
        "fuenft",
        "sechst",
        "siebt",
        "acht",
        "neunt",
        "zehnt",
        "elft",
        "zwoelft",
    )
    roman_numerals = (
        "i",
        "ii",
        "iii",
        "iv",
        "v",
        "vi",
        "vii",
        "viii",
        "ix",
        "x",
        "xi",
        "xii",
    )

    sgb_dict = {}

    # Walk through the books and register every spelling of each one.
    for nr, (word, roman) in enumerate(zip(ordinal_stems, roman_numerals), start=1):
        # Books 9 and 10 appear to have a roman numbering in the abbreviation
        # used by juris and gesetze-im-internet. Abbreviations of the other books
        # always contain arabic numbering.
        if nr in {9, 10}:
            value = (f"SGB-{roman.upper()}", f"SGB-{nr}")
        else:
            value = f"SGB-{nr}"

        spellings = (
            f"{word} buch",
            f"{word} buch sozialgesetzbuch",
            f"{word} buch d sozialgesetzbuch",
            f"sgb {roman}",
            f"sgb {nr}",
            f"{nr}. buch sozialgesetzbuch",
            f"sgb-{roman}",
            f"sgb-{nr}",
        )
        for spelling in spellings:
            sgb_dict[spelling] = value

    return sgb_dict


# Module-level lookup table of SGB book spellings, built once when the module is
# imported.
sgb_dict = generate_sgb_dict()


# Maps a regex (kept here as an uncompiled string) that matches the spellings and
# abbreviations of a citation unit to the canonical label of that unit.
# NOTE(review): only some patterns spell out upper/lower case variants (e.g.
# "[Aa][Bb][Ss]"); whether callers compile these with IGNORECASE is not visible
# here - confirm before relying on case handling.
unit_patterns = {
    r"§{1,2}": "§",
    r"Art\b\.?|[Aa]rtikels?n?": "Art",
    r"Nr\b\.?|Nummer|Nrn?\b\.?": "Nr",
    r"[Aa][Bb][Ss]\b\.?|Absatz|Absätze": "Abs",
    r"Unter[Aa]bsatz|Unter[Aa]bs\b\.?": "Uabs",
    r"S\b\.?|Satz|Sätze": "Satz",
    r"Ziffern?|Ziffn?\b\.?": "Ziffer",
    r"Buchstaben?|Buchst\b\.?": "Buchstabe",
    r"Halbsatz": "Halbsatz",
    r"Teilsatz": "Teilsatz",
    r"Abschnitte?|Abschn\b\.?": "Abschnitt",
    r"Alternativen?|Alt\b\.?": "Alternative",
    r"Anhang|Anhänge": "Anhang",
}

# fmt: off
# Case-insensitive pattern for the written-out words "erste", "zweite", "dritte"
# and "letzte" (captured in group 1), each optionally followed by "r" and/or "s".
# Presumably used for numbering words that precede a unit - verify against callers.
pre_numb_pattern = regex.compile(
    r"("
    r"erste|"
    r"zweite|"
    r"dritte|"
    r"letzte"
    r")r?s?",
    flags=regex.IGNORECASE,
)


# Case-insensitive pattern for a single number token of a citation.
# Group 1 captures the number itself, group 2 the suffix that follows it.
numb_pattern = regex.compile(
    r"("
    r"\d+(?>\.\d+)*[a-z]?|"  # digits, optional ".digits" groups, optional letter
    r"[ivx]+|"  # run of the letters i, v and x (roman numeral)
    r"[a-z]\)?"  # single letter, optionally followed by ")"
    r")"
    r"("
    r"ff?\.|"  # "f." or "ff." (presumably "and following" - confirm)
    r"ff\b|"  # "ff" ending at a word boundary
    r"(?<=[a-z])\)|"  # ")" directly after a letter
    r"\b"  # no suffix: the number just has to end at a word boundary
    r")",
    flags=regex.IGNORECASE,
)


# Regex source for the separator between two parts of a citation: optional
# whitespace and comma followed by one of the connectors listed below, then an
# optional "nach" and an optional article ("der", "des", "den", "die").
split_citation_into_parts_pattern_str = (
    r"(?>\s*,?(?>" r",\s*|" r"\s+und\s+|" r"\s+sowie\s+|"
    # NOTE(review): the "bis" alternative below is disabled; "bis" has its own
    # pattern in split_citation_into_range_parts_pattern.
    #             r'\s+bis\s+|'
    r"\s+oder\s+|"
    # Optional "jeweils" / "auch", then "in Verbindung mit" or the abbreviation
    # "i.V.m." (dots and inner spaces optional).
    r"(?>\s+jeweils)?(?>\s+auch)?\s+(?>in\s+Verbindung\s+mit|i\.?\s?V\.?\s?m\.?)\s+"
    r"))"
    r"(?>nach\s+)?"
    r"(?>(?>der|des|den|die)\s+)?"
)
# fmt: on

# Compiled, case-insensitive form of split_citation_into_parts_pattern_str.
split_citation_into_parts_pattern = regex.compile(
    split_citation_into_parts_pattern_str,
    flags=regex.IGNORECASE,
)

# Separator between the two ends of a range: optional whitespace and comma, then
# "bis" surrounded by whitespace.
# NOTE(review): compiled without IGNORECASE, unlike
# split_citation_into_parts_pattern - confirm that this is intended.
split_citation_into_range_parts_pattern = regex.compile(r"\s*,?\s+bis\s+")

# Regex source for the split point between a unit and its number: a whitespace
# character, the empty position between an "Art."/"Artikel"/"Artikeln" spelling and
# a digit, or the empty position between "§" and a letter or digit.
# The lookbehind alternatives differ in length, which the stdlib `re` module
# rejects; this relies on the `regex` module.
# NOTE(review): the `Art\b` alternative looks unreachable, because there is no word
# boundary between "t" and a following digit - confirm whether "Art12" should split.
split_unit_number_pattern_str = (
    r"\s|(?<=Art\.|Art\b|Artikeln|Artikel)(?=\d)|(?<=§)(?=[A-Z0-9])"
)
# Compiled, case-insensitive form of split_unit_number_pattern_str.
split_unit_number_pattern = regex.compile(
    split_unit_number_pattern_str, flags=regex.IGNORECASE
)
Source code for quantlaw.de_extract.statutes_parse_patterns

quantlaw

Navigation

Related Topics