Source code for quantlaw.de_extract.load_statute_names

import json
import os
import re
import shutil
import zipfile

import lxml.etree
import requests

from quantlaw.de_extract.stemming import stem_law_name


def _node_text(node):
    """Return the concatenated text content of an lxml node, stripped."""
    return (
        lxml.etree.tostring(node, method="text", encoding="utf8")
        .decode("utf8")
        .strip()
    )


def _read_law_names(zip_path):
    """Build the ``{stemmed name: abbreviation stem}`` mapping from a zip archive."""
    law_names = {}
    with zipfile.ZipFile(zip_path) as zip_file:
        # Sorted so that the result is deterministic: when two files yield the
        # same stemmed name, the one sorting last wins.
        for member_name in sorted(zip_file.namelist()):
            if not member_name.endswith(".xml"):
                continue
            with zip_file.open(member_name) as member_file:
                tree = lxml.etree.parse(member_file)

            first_norm_nodes = tree.xpath("(//norm)[1]")
            if not first_norm_nodes:
                continue
            first_norm = first_norm_nodes[0]

            # All paths are relative (".//") so that only the first norm is
            # searched; an absolute "//amtabk" would match the whole document
            # regardless of the context node.
            abk_nodes = first_norm.xpath(".//jurabk | .//amtabk")
            if not abk_nodes:
                continue
            abk = _node_text(abk_nodes[0])
            # Lowercase the abbreviation and replace every character outside
            # a-z, 0-9 and "-" with an underscore.
            abk_stem = re.sub(r"[^a-z0-9\-]", "_", abk.lower())

            law_names[stem_law_name(abk)] = abk_stem

            heading_nodes = first_norm.xpath(
                ".//jurabk | .//amtabk | .//langue | .//kurzue"
            )
            for heading_node in heading_nodes:
                law_names[stem_law_name(_node_text(heading_node))] = abk_stem
    return law_names


def load_law_names(date, path):
    """Download a statute archive and write a JSON index of law names.

    The archive ``{date}.zip`` is fetched from the
    ``QuantLaw/gesetze-im-internet`` GitHub repository and stored temporarily
    at ``path + ".zip"``. For every ``.xml`` member, the first ``norm`` element
    is inspected: the text of its first ``jurabk``/``amtabk`` node is taken as
    the abbreviation, and the texts of its ``jurabk``, ``amtabk``, ``langue``
    and ``kurzue`` nodes are each passed through ``stem_law_name`` and mapped
    to the normalised abbreviation. The mapping is written to ``path`` as
    JSON and the temporary archive is removed.

    Args:
        date: Name of the archive to download; interpolated into the URL
            ``.../archive/{date}.zip``.
        path: Destination path of the JSON file.

    Raises:
        requests.HTTPError: If the download does not answer with status 200.
    """
    zip_path = path + ".zip"
    url = f"https://github.com/QuantLaw/gesetze-im-internet/archive/{date}.zip"

    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True) as r:
            # Explicit check instead of ``assert`` so that it is not stripped
            # when Python runs with -O.
            if r.status_code != 200:
                raise requests.HTTPError(
                    f"Unexpected status {r.status_code} for {url}", response=r
                )
            with open(zip_path, "wb") as f:
                # Let urllib3 undo any transfer encoding before copying.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)

        law_names = _read_law_names(zip_path)

        with open(path, "w", encoding="utf8") as f:
            json.dump(law_names, f, ensure_ascii=False, indent=0)
    finally:
        # Never leave the temporary archive behind, even if parsing failed.
        if os.path.exists(zip_path):
            os.remove(zip_path)
Source code for quantlaw.de_extract.load_statute_names

quantlaw

Navigation

Related Topics