# Source code for quantlaw.de_extract.statutes_parse

import itertools
from collections import Counter

from regex import regex

from quantlaw.de_extract.statutes_abstract import StatutesProcessor
from quantlaw.de_extract.statutes_parse_patterns import (
    numb_pattern,
    pre_numb_pattern,
    sgb_dict,
    split_citation_into_parts_pattern,
    split_citation_into_range_parts_pattern,
    split_unit_number_pattern,
    unit_patterns,
)
from quantlaw.de_extract.stemming import stem_law_name


class StringCaseException(Exception):
    """
    Exception is raised if a unit in a reference cannot be parsed. In this case
    it is often an issue of upper or lower case formatting.
    """
class NoUnitMatched(Exception):
    """
    Exception is raised if a unit in a reference cannot be parsed.
    """
class StatutesParser(StatutesProcessor):
    """
    Class to parse the content of a reference area identified by StatutesExtractor
    """

    def parse_main(self, main_text: str) -> list:
        """
        Parses a string containing a reference to a specific section within a
        given law. E.g. "§ 123 Abs. 4 Satz 5 und 6". The parsed information is
        formatted into lists nested in lists nested in lists.

        The outer list is a list of references. References are lists of path
        components. A path component is e.g. "Abs. 4". A path component is
        represented by a list with two elements: The first contains the unit,
        the second the value.

        The example above would be represented as
        `[[['§', '123'], ['Abs', '4'], ['Satz', '5']],
        [['§', '123'], ['Abs', '4'], ['Satz', '6']]]`.

        Args:
            main_text: string to parse

        Returns: The parsed reference.
        """
        citation = self.fix_errors_in_citation(main_text.strip())
        enum_parts = self.split_citation_into_enum_parts(citation)

        reference_paths = []
        for enum_part in enum_parts:
            for string in enum_part:
                splitted_citation_part_list = list(self.split_citation_part(string))
                if splitted_citation_part_list:
                    reference_paths.append(splitted_citation_part_list)
                else:
                    print(f"Empty citation part in {citation} in part {string}")

        reference_paths = self.split_parts_accidently_joined(reference_paths)

        # Pair every path with its positional predecessor. Looking the
        # predecessor up with ``list.index`` would return the first *equal*
        # path, which is the wrong one as soon as a reference is repeated.
        # The predecessor has already been completed by the previous
        # iteration, so inferred units propagate along the enumeration.
        for prev_reference_path, reference_path in zip(
            reference_paths, reference_paths[1:]
        ):
            self.infer_units(reference_path, prev_reference_path)

        return reference_paths

    def parse_law(self, law_text: str, match_type: str, current_lawid: str = None):
        """
        Parses the law information from a reference found by
        StatutesMatchWithMainArea.

        Args:
            law_text: E.g. "BGB"
            match_type: E.g. "dict". Handled values are "dict", "sgb" and
                "internal"; every other value is treated as ignore/unknown.
            current_lawid: Key of the law that contains the reference. Required
                if match_type is "internal".

        Returns: The key of the parsed law or None if match_type is not one of
            the handled values.

        Raises:
            Exception: If match_type is "internal" and current_lawid is None.
        """
        if match_type == "dict":
            lawname_stem = stem_law_name(law_text)
            match = self.match_law_name(lawname_stem)
            return self.laws_lookup[match]

        if match_type == "sgb":
            lawid = sgb_dict[stem_law_name(law_text)]
            if isinstance(lawid, tuple):
                # Two candidate keys: prefer the first one if it is a known
                # law, otherwise fall back to the second one (known or not).
                assert len(lawid) == 2
                if lawid[0] in self.laws_lookup.values():
                    return lawid[0]
                return lawid[1]
            return lawid

        if match_type == "internal":
            if current_lawid is None:
                raise Exception("Current law id must be set for internal reference")
            return current_lawid

        return None  # match_type: ignore or unknown

    @staticmethod
    def stem_unit(unit: str):
        """
        Brings a unit into a standard format. E.g. removes abbreviations,
        grammatical differences, spelling errors, etc.

        Args:
            unit: A string containing a unit that should be converted into a
                standard format.

        Returns: Unit in a standard format as string. E.g. §, Art, Nr,
            Halbsatz, Anhang, ...

        Raises:
            NoUnitMatched: If none of the unit patterns fully matches the unit.
        """
        for unit_pattern in unit_patterns:
            if regex.fullmatch(unit_pattern, unit):
                return unit_patterns[unit_pattern]
        raise NoUnitMatched(unit)

    @staticmethod
    def is_unit(token: str):
        """
        Returns: A match object (truthy) if the token is a unit, else None.
        """
        return regex.fullmatch("|".join(unit_patterns.keys()), token)

    @staticmethod
    def is_pre_numb(token: str):
        """
        Returns: A match object (truthy) if the token is a number that comes
            *before* the unit, else None. E.g. '*erster* Halbsatz'
        """
        return pre_numb_pattern.fullmatch(
            token,
        )

    @staticmethod
    def is_numb(token: str):
        """
        Returns: A match object (truthy) if the token is a 'numeric' value of
            the reference, else None.
        """
        return numb_pattern.fullmatch(
            token,
        )

    @staticmethod
    def fix_errors_in_citation(citation):
        """
        Fix some common inconsistencies in the references such as double spaces.
        """
        # Collapse every run of whitespace into a single space.
        result = regex.sub(r"\s+", " ", citation)
        # Ensure a space between '§' and a directly following digit.
        result = regex.sub(r"§(?=\d)", "§ ", result)
        # Drop the comma in front of 'bis'.
        result = regex.sub(r",\sbis\s", " bis ", result)
        return result

    @staticmethod
    def split_citation_into_enum_parts(citation):
        """
        A citation can contain references to multiple parts of the law.
        E.g. '§§ 20 und 35' or 'Art. 3 Abs. 1 Satz 1, Abs. 3 Satz 1'.
        The citation is split into parts so that each referenced section of the
        law is separated. E.g. '§§ 20' and '35' resp. 'Art. 3 Abs. 1 Satz 1' and
        'Abs. 3 Satz 1'.
        However, ranges are not split on this level: E.g. "§§ 1 bis 10" stays
        within one enumeration part.

        Returns: A list of enumeration parts. Each enumeration part is itself a
            list of strings produced by the range split pattern.
        """
        enum_parts = split_citation_into_parts_pattern.split(
            citation,
        )

        # Split range
        return [
            split_citation_into_range_parts_pattern.split(part) for part in enum_parts
        ]

    @staticmethod
    def split_parts_accidently_joined(reference_paths):
        """
        Reformats the parsed references to separate accidentally joined
        references. E.g. the original reference "§ 123 § 126" will not be split
        by split_citation_into_enum_parts because the separation is falsely not
        indicated by a ',', 'or' etc. It is the unit '§' from which it can be
        inferred that the citation contains references to two parts of statutes.
        This function accounts for the case that the unit '§' or 'Art' appears
        twice in the same reference path and splits the path into several
        elements.

        Args:
            reference_paths: List of reference paths. Each path is a list of
                [unit, value] parts.

        Returns: A new list of reference paths.
        """
        # 'Art' is the main unit as soon as it occurs anywhere in the citation.
        contains_art = any(
            part[0] == "Art" for part in itertools.chain.from_iterable(reference_paths)
        )
        main_unit = "Art" if contains_art else "§"

        new_reference_paths = []
        for reference_path in reference_paths:
            temp_path = []
            for part in reference_path:
                if part[0] == main_unit:
                    # A main unit starts a new path. Flush what was collected.
                    if temp_path:
                        new_reference_paths.append(temp_path)
                    temp_path = []
                temp_path.append(part)
            new_reference_paths.append(temp_path)
        return new_reference_paths

    @staticmethod
    def infer_units(reference_path, prev_reference_path):
        """
        In some cases of an enumeration a numeric value is not directly
        prefixed by the corresponding unit. E.g. "§ 123 Abs. 1 S. 2, 3 S. 4".
        In this case "3" is not prefixed with its unit. Instead it can be
        inferred by looking at the whole citation that it is the next higher
        unit of "S.", hence "Abs.". These inferred units are added to the
        parsed data.

        Args:
            reference_path: Path to complete. It is modified in place.
            prev_reference_path: The path preceding reference_path in the
                citation.
        """
        prev_path_units = [part[0] for part in prev_reference_path]
        first_part = reference_path[0]

        if not first_part[0]:
            # The unit of the first part is missing and must be inferred.
            if len(reference_path) > 1:
                try:
                    prev_unit_index = prev_path_units.index(reference_path[1][0])
                except ValueError:
                    # Unit of the second part is not part of the previous path.
                    first_part[0] = prev_path_units[-1]
                else:
                    # Use the unit one level above the unit of the second part.
                    # NOTE(review): if prev_unit_index is 0 the index -1 wraps
                    # around to the last unit of the previous path.
                    first_part[0] = prev_path_units[prev_unit_index - 1]
            else:
                first_part[0] = prev_path_units[-1]

        # Prepend the leading components of the previous path, up to (but not
        # including) the unit the current path starts with.
        try:
            prev_unit_index = prev_path_units.index(reference_path[0][0])
        except ValueError:
            # The starting unit does not occur in the previous path.
            reference_path[0:0] = prev_reference_path
        else:
            reference_path[0:0] = prev_reference_path[:prev_unit_index]

    @staticmethod
    def split_citation_part(string: str):
        """
        A string is tokenized. Tokens are identified as units or values. Pairs
        are built to connect the units with their respective values. If the
        unit cannot be identified (and must be inferred later) None is returned
        as unit.

        Args:
            string: A string that is part of a reference and cites *one* part
                of a statute.

        Returns: As a generator, lists are yielded, each containing the unit
            (or None) and the respective value.

        Raises:
            StringCaseException: If a token is neither a unit nor a value.
            AssertionError: If a unit is followed by a token that is not a
                value.
        """
        # Tokenization

        # Attach a separate 'f.'/'ff.'/'ff' to the preceding value as 'ff.'.
        # fmt: off
        string = regex.sub(
            r"("
            r"\d+(?>\.\d+)?[a-z]?|"
            r"\b[ivx]+|"
            r"\b[a-z]\)?"
            r")"
            r"(\sff?\.|\sff\b)",
            r"\1ff.",
            string,
            flags=regex.IGNORECASE,
        )
        # fmt: on
        token_iter = iter(
            split_unit_number_pattern.split(
                string,
            )
        )
        end_of_tokens = object()  # marks that no further token is available

        # Building pairs of units with their resp. values
        for token in token_iter:
            if StatutesParser.is_unit(token):
                numb = next(token_iter, end_of_tokens)
                if numb is end_of_tokens:  # when citation ends with unit
                    print(
                        f"Citation {string} ends with unit {token}. Ignoring last unit."
                    )
                    break
                unit = StatutesParser.stem_unit(token)
                assert StatutesParser.is_numb(numb), numb

            elif StatutesParser.is_pre_numb(token):
                numb = token
                token = next(token_iter, end_of_tokens)
                if token is end_of_tokens:
                    # The citation ends with a number that should precede a
                    # unit. There is nothing to pair it with.
                    print(f"Citation {string} ends with number {numb}. Ignoring it.")
                    break
                if not StatutesParser.is_unit(token):
                    print(token, "is not a unit in", string)
                    # to fix citation "§ 30 DRITTER ABSCHNITT"
                    # Last part is now ignored,
                    # but reference areas can still be improved.
                    continue
                unit = StatutesParser.stem_unit(token)

            elif StatutesParser.is_numb(token):
                unit = None
                numb = token

            else:
                raise StringCaseException(token, "in", string)

            # Strip trailing 'f.', 'ff.', 'ff' or ')' from the value.
            numb = regex.sub(r"(ff?\.|ff|\))$", "", numb)
            yield [unit, numb]