Source code for quantlaw.de_extract.statutes_areas

from regex import regex

from quantlaw.de_extract.statutes_abstract import (
    StatusMatch,
    StatutesMatchWithMainArea,
    StatutesProcessor,
)
from quantlaw.de_extract.statutes_areas_patterns import (
    eu_law_name_pattern,
    ignore_law_name_pattern,
    reference_range_pattern,
    sgb_law_name_pattern,
    suffix_ignore_pattern,
)
from quantlaw.de_extract.stemming import stem_law_name


class StatutesExtractor(StatutesProcessor):
    """
    Locates the areas of a text in which German statutes and regulations are
    cited.
    """

    def search(self, text: str, pos: int = 0) -> StatusMatch:
        """
        Look for the first statute reference in ``text`` at or after ``pos``.

        Args:
            text: The text to scan.
            pos: Offset in ``text`` at which scanning starts.

        Returns:
            ``None`` when ``reference_range_pattern`` finds nothing. A plain
            ``StatusMatch`` when the pattern matched but its ``main`` group is
            empty. Otherwise a ``StatutesMatchWithMainArea`` that additionally
            carries the lengths of the trailing suffix and law name and the
            type of the reference.
        """
        trigger = reference_range_pattern.search(text, pos)
        if not trigger:
            return None

        area_start = trigger.start()
        area_end = trigger.end()

        # A trigger (e.g. "§") was found, but no citation follows it.
        if not trigger.groupdict()["main"]:
            return StatusMatch(text=text, start=area_start, end=area_end)

        # Measure the optional suffix and law name that may follow the main
        # area; this also categorizes the type of the reference.
        trailing_text = text[area_end:]
        suffix_len, law_len, law_match_type = self.get_suffix_and_law_name(
            trailing_text
        )

        return StatutesMatchWithMainArea(
            text=text,
            start=area_start,
            end=area_end,
            suffix_len=suffix_len,
            law_len=law_len,
            law_match_type=law_match_type,
        )

    def find_all(self, text: str, pos: int = 0):
        """
        Generator variant of ``search``: yields every match found in ``text``,
        starting at ``pos``, in order of appearance.
        """
        cursor = pos
        while True:
            found = self.search(text, cursor)
            if not found:
                return
            yield found

            # Resume after the match; if it has a main area, also skip the
            # suffix and law name that belong to it.
            cursor = found.end
            if found.has_main_area():
                cursor += found.suffix_len + found.law_len

    def get_suffix_and_law_name(self, string: str):
        """
        Inspect the text that directly follows the main area of a reference.

        Args:
            string: The text starting right behind the main area.

        Returns:
            A tuple of
            1. the length of the connector between the numbers and the law
               name (e.g. " der "),
            2. the length of the law name as written in ``string``,
            3. the type of the reference: "dict", "sgb", "eu" or "ignore" for
               a recognised law name, "unknown" if a der/des connector is
               present but no law name was recognised, and "internal" if
               neither a connector nor a law name was found.
            Lengths that were not found are 0.
        """
        article = regex.match(r"^,?\s+?de[sr]\s+", string)
        if article:
            gap = article.end()
            # Only the 1000 characters behind the connector are inspected.
            candidate = string[gap : gap + 1000]
            with_article_checks = (
                ("dict", self.get_dict_law_name_len),
                ("sgb", self.get_sgb_law_name_len),
                ("eu", self.get_eu_law_name_len),
                ("ignore", self.get_ignore_law_name_len),
            )
            for kind, measure in with_article_checks:
                name_len = measure(candidate)
                if name_len:
                    return gap, name_len, kind
            return gap, 0, "unknown"

        # No der/des connector: the law name may follow after whitespace only.
        whitespace = regex.match(r"^[\s\n]+", string[:1000])
        if whitespace:
            gap = len(whitespace[0])
            candidate = string[gap:1000]
            without_article_checks = (
                ("dict", self.get_dict_law_name_len),
                ("sgb", self.get_sgb_law_name_len),
                ("ignore", self.get_no_suffix_ignore_law_name_len),
            )
            for kind, measure in without_article_checks:
                name_len = measure(candidate)
                if name_len:
                    return gap, name_len, kind

        return 0, 0, "internal"

    def get_dict_law_name_len(self, test_str):
        """
        Check whether ``test_str`` begins with a law name known to
        ``self.match_law_name``.

        Returns:
            The length of the law name as written in ``test_str``, or 0 if no
            law name was matched.
        """
        # The known law names are stemmed, so the candidate is stemmed as well.
        stemmed = stem_law_name(test_str)

        law_name = self.match_law_name(stemmed)
        if not law_name:
            return 0

        # Map the matched area of the stemmed text back onto the original text:
        # both are split into tokens (runs of word and of non-word characters)
        # and the original area is taken to span as many tokens as the match.
        token_pattern = r"[\w']+|[\W']+"
        original_tokens = regex.findall(token_pattern, test_str)
        law_name_tokens = regex.findall(token_pattern, law_name)
        token_count = len(law_name_tokens)
        raw_law_name = "".join(original_tokens[:token_count])
        assert len(original_tokens[0].strip()) > 0, (law_name, test_str, stemmed)

        # If the last word of the law name continues in the original text in a
        # way that stemming would not remove, report no match.
        # TODO look for other matches before returning no match
        final_token_stem = stem_law_name(original_tokens[token_count - 1])
        if law_name_tokens[-1] != final_token_stem:
            return 0

        return len(raw_law_name)

    @staticmethod
    def get_no_suffix_ignore_law_name_len(test_str) -> int:
        """
        Returns:
            The length in chars of a law name at the start of ``test_str`` for
            the case that no suffix connects the main area with the law name,
            or 0 if ``ignore_law_name_pattern`` does not match.
        """
        found = ignore_law_name_pattern.match(test_str)
        if not found:
            return 0
        return len(found[0])

    @staticmethod
    def get_sgb_law_name_len(test_str) -> int:
        """
        Returns:
            The length in chars of an SGB law name at the start of
            ``test_str``, or 0 if ``sgb_law_name_pattern`` does not match.
        """
        found = sgb_law_name_pattern.match(test_str)
        if not found:
            return 0
        return len(found[0])

    @staticmethod
    def get_eu_law_name_len(test_str) -> int:
        """
        Returns:
            The length in chars of a name of European legislation at the start
            of ``test_str``, or 0 if ``eu_law_name_pattern`` does not match.
        """
        found = eu_law_name_pattern.match(test_str)
        if not found:
            return 0
        return len(found[0])

    @staticmethod
    def get_ignore_law_name_len(test_str):
        """
        Returns:
            The length in chars of a law name to ignore at the start of
            ``test_str``, or 0 if ``suffix_ignore_pattern`` does not match.
        """
        found = suffix_ignore_pattern.match(test_str)
        if not found:
            return 0
        return len(found[0])