Source code for quantlaw.de_extract.load_statute_names

import json
import os
import re
import shutil
import zipfile

import lxml.etree
import requests

from quantlaw.de_extract.stemming import stem_law_name


def _node_text(node):
    """Return the text content of an lxml node, stripped of surrounding whitespace."""
    return (
        lxml.etree.tostring(node, method="text", encoding="utf8")
        .decode("utf8")
        .strip()
    )


def _collect_law_names(zip_path):
    """Build the stemmed-name -> abbreviation-stem mapping from a zip archive.

    Every ``.xml`` member of the archive is parsed (in sorted member-name
    order) and only its first ``<norm>`` element is inspected. Members without
    a ``<norm>`` element, or whose first ``<norm>`` carries neither a
    ``jurabk`` nor an ``amtabk`` element, are skipped.
    """
    law_names = {}
    with zipfile.ZipFile(zip_path) as zip_file:
        for member_name in sorted(zip_file.namelist()):
            if not member_name.endswith(".xml"):
                continue

            with zip_file.open(member_name) as member_file:
                tree = lxml.etree.parse(member_file)

            first_norm_nodes = tree.xpath("(//norm)[1]")
            if not first_norm_nodes:
                continue
            first_norm = first_norm_nodes[0]

            # All terms are relative to the first norm. A bare "//amtabk"
            # would be evaluated from the document root and could pick up
            # elements that belong to later norms.
            abk_nodes = first_norm.xpath(".//jurabk | .//amtabk")
            if not abk_nodes:
                continue

            abk = _node_text(abk_nodes[0])
            # Lower-case the abbreviation and replace every character other
            # than a-z, 0-9 and "-" with an underscore.
            abk_stem = re.sub(r"[^a-z0-9\-]", "_", abk.lower())
            law_names[stem_law_name(abk)] = abk_stem

            heading_nodes = first_norm.xpath(
                ".//jurabk | .//amtabk | .//langue | .//kurzue"
            )
            for heading_node in heading_nodes:
                law_names[stem_law_name(_node_text(heading_node))] = abk_stem
    return law_names


def load_law_names(date, path):
    """Download a statute archive and write a JSON lookup of law names.

    The zip archive
    ``https://github.com/QuantLaw/gesetze-im-internet/archive/{date}.zip``
    is downloaded to ``path + ".zip"``. For every XML file in it, the
    abbreviation and the heading texts found in the first ``<norm>`` element
    (``jurabk``, ``amtabk``, ``langue``, ``kurzue``) are passed through
    ``stem_law_name`` and mapped to the sanitized abbreviation. The resulting
    dict is written as JSON to ``path``.

    The temporary zip file is removed afterwards, including when an error
    occurs.

    Args:
        date: Archive reference that is interpolated into the download URL.
        path: Destination path of the JSON file. ``path + ".zip"`` is used as
            the temporary download location.

    Raises:
        requests.HTTPError: If the download does not answer with status 200.
    """
    zip_path = path + ".zip"
    url = f"https://github.com/QuantLaw/gesetze-im-internet/archive/{date}.zip"

    try:
        # The context manager releases the streamed connection in every case.
        with requests.get(url, stream=True) as response:
            # Explicit check instead of ``assert``, which is removed when
            # Python runs with -O.
            if response.status_code != 200:
                raise requests.HTTPError(
                    f"Downloading {url} failed with status "
                    f"{response.status_code}",
                    response=response,
                )
            # Let urllib3 undo any transfer encoding (gzip/deflate) so the
            # bytes written to disk are the plain zip file.
            response.raw.decode_content = True
            with open(zip_path, "wb") as f:
                shutil.copyfileobj(response.raw, f)

        law_names = _collect_law_names(zip_path)

        with open(path, "w", encoding="utf8") as f:
            json.dump(law_names, f, ensure_ascii=False, indent=0)
    finally:
        # Never leave the temporary download behind; it may not exist if the
        # request itself failed.
        try:
            os.remove(zip_path)
        except FileNotFoundError:
            pass