# Source code for cookbase.parsers.jsonfoodex

"""Parsing suite for the Cookbase platform from `FoodEx2`_  data into JSON documents.

The main command, :option:`parsexml`, allows for lossless translation from `FoodEx2`_
XML data into a collection of JSON documents. It can also filter out and discard the
desired hierarchies together with the ingredients that belong only to those
hierarchies. Field contents are parsed into Python built-in types (:keyword:`str`,
:keyword:`int` and :keyword:`bool`). The original ordering and format are respected,
however there are a number of particularities when mapping into JSON to be considered:

    - The JSON output represents the content of the root :const:`<catalogue>` tag.
    - The :const:`<hierarchyGroups>` tag is mapped into a JSON array holding the text
      from each contained :const:`<hierarchyGroup>` tag.
    - The :const:`<hierarchyAssignment>` tag is mapped into a JSON object whose key is
      the :const:`<hierarchyCode>` tag content, and the value is a JSON document
      including all its data.
    - The :const:`<implicitAttribute>` tag is mapped into a JSON object whose key is the
      :const:`<attributeCode>` tag content, and the value is an array with the text from
      each contained :const:`<attributeValue>` tag.

The :option:`-d`/:option:`--discardedhierarchies` option lets the user choose whether or
not to discard any desired hierarchy (including the terms that are only related to them)
by providing a list of hierarchy codes. By default, if not used, all hierarchies not
directly related to food preparation are discarded: :const:`botanic`, :const:`pest`,
:const:`biomo`, :const:`legis`, :const:`feed`, :const:`partcon`, :const:`place`,
:const:`vetdrug`, :const:`report`, :const:`fpurpose`, :const:`replev`, :const:`targcon`
and :const:`feedAddExpo`. To keep every hierarchy, pass the
:option:`-d`/:option:`--discardedhierarchies` flag without listing any hierarchy
codes.

The :option:`-cb`/:option:`--cookbase` flag makes the parser generate identifiers
(:code:`_id`) for each catalogue term suitable for the Cookbase platform.

The :option:`hierarchize` command builds a JSON document describing a hierarchy
tree.

"""
import argparse
import json
import os
from collections import OrderedDict
from math import ceil
from time import time
from typing import Any, Dict, Optional, Tuple
from xml.dom import minidom

from cookbase.utils import _HelpAction

# from cookbase.parsers import termcode # this import is performed in-place
def vprint(*args: Any, **kwargs: Any) -> None:
    """Discard a verbose progress message.

    This no-op is the default so that :func:`parsexml` and :func:`hierarchize` can be
    called as library functions. :func:`_main` rebinds the module-level name to
    :func:`print` when the ``-v``/``--verbose`` flag is given.
    """
    return None


def _non_text_children(node: minidom.Node) -> list:
    """Return the children of ``node`` that are not text nodes, in document order."""
    return [child for child in node.childNodes if child.nodeType != child.TEXT_NODE]


def _node_text(node: minidom.Node) -> str:
    """Return the value of the first child of ``node``, i.e. the tag's text content."""
    return node.firstChild.nodeValue


def _parse_fields(
    node: minidom.Node, int_fields: Tuple = (), bool_fields: Tuple = ()
) -> OrderedDict:
    """Map every non-text child of ``node`` to its text content, keyed by tag name.

    Tags listed in ``int_fields`` are converted with :func:`int`; tags listed in
    ``bool_fields`` become :const:`True` only when their text is exactly ``"true"``.
    """
    fields = OrderedDict()

    for child in _non_text_children(node):
        text = _node_text(child)

        if child.nodeName in int_fields:
            fields[child.nodeName] = int(text)
        elif child.nodeName in bool_fields:
            fields[child.nodeName] = text == "true"
        else:
            fields[child.nodeName] = text

    return fields


def _parse_hierarchy(hierarchy_node: minidom.Node) -> OrderedDict:
    """Parse a ``<hierarchy>`` node: ``<hierarchyGroups>`` becomes a list of texts,
    any other child becomes a dictionary of its fields."""
    hierarchy = OrderedDict()

    for section in _non_text_children(hierarchy_node):
        if section.nodeName == "hierarchyGroups":
            hierarchy[section.nodeName] = [
                _node_text(group) for group in _non_text_children(section)
            ]
        else:
            hierarchy[section.nodeName] = _parse_fields(
                section, int_fields=("hierarchyOrder",)
            )

    return hierarchy


def _parse_implicit_attributes(section: minidom.Node) -> OrderedDict:
    """Parse an ``<implicitAttributes>`` node into ``{attributeCode: [values]}``."""
    attributes = OrderedDict()

    for entry in _non_text_children(section):
        code = None
        values = []
        has_values = False

        for child in entry.childNodes:
            if child.nodeName == "attributeCode":
                # only the first <attributeCode> is taken into account
                if code is None:
                    code = _node_text(child)
            elif child.nodeName == "attributeValues":
                has_values = True
                values.extend(_node_text(v) for v in _non_text_children(child))

        if not has_values:
            continue

        if code is None:
            # without a code there is no key to store the values under; reusing the
            # code of a previous entry would silently corrupt the output
            print(
                "   WARNING: an <implicitAttribute> without <attributeCode> was"
                + " skipped"
            )
            continue

        attributes[code] = values

    return attributes


def _parse_term_sections(
    term_node: minidom.Node, discarded: set
) -> Tuple[OrderedDict, int]:
    """Parse the children of a ``<term>`` node, skipping discarded hierarchies.

    Returns the parsed sections together with the number of hierarchy assignments of
    the term that refer to the ``expo`` hierarchy.
    """
    sections = OrderedDict()
    expo_assignments = 0

    for section in _non_text_children(term_node):
        if section.nodeName == "hierarchyAssignments":
            assignments = OrderedDict()

            for assignment in _non_text_children(section):
                hierarchy_code = _node_text(
                    assignment.getElementsByTagName("hierarchyCode")[0]
                )

                # the counter is updated before the discard check on purpose, as the
                # original implementation did
                if hierarchy_code == "expo":
                    expo_assignments += 1

                if hierarchy_code in discarded:
                    continue

                assignments[hierarchy_code] = _parse_fields(
                    assignment, int_fields=("order",), bool_fields=("reportable",)
                )

            sections[section.nodeName] = assignments
        elif section.nodeName == "implicitAttributes":
            sections[section.nodeName] = _parse_implicit_attributes(section)
        else:
            sections[section.nodeName] = _parse_fields(section)

    return sections, expo_assignments


def parsexml(args: argparse.Namespace) -> None:
    """Method implementing the parsing logic.

    Reads the FoodEx2 XML file at ``args.inputfile`` and writes the resulting JSON
    catalogue to ``args.outputfile``. Hierarchies listed in
    ``args.discardedhierarchies`` are left out, and so are the terms that are assigned
    to at most one non-discarded hierarchy. When ``args.termsfile`` is set the terms
    are not embedded in the catalogue but written separately: one file per term inside
    that folder (``args.single``), ``args.nchunks`` numbered chunk files, or a single
    file. When ``args.cookbase`` is set an ``_id`` field is generated for every term;
    if :mod:`cookbase.parsers.termcode` cannot be imported a warning is printed and
    ``args.cookbase`` is reset to :const:`False`.

    :param args: Command-line arguments
    :type args: argparse.Namespace
    """
    if args.cookbase:
        try:
            from cookbase.parsers import termcode
        except ImportError:
            print(
                "   WARNING: cookbase.parsers.termcode is not included under "
                + "PYTHONPATH. The parser will not generate '_id' fields."
            )
            args.cookbase = False

    start_time = time()
    generic_term_counter = 0
    expo_term_counter = 0
    # set copy for fast membership tests; the original list is kept for the output
    discarded = set(args.discardedhierarchies)

    vprint("loading and parsing the FoodEx2 Matrix...")

    foodex_xml = minidom.parse(args.inputfile)
    catalogue = OrderedDict()

    # including information about the parser configuration
    vprint("including information about the parser configuration...")

    catalogue["parserInfo"] = {"discardedHierarchies": args.discardedhierarchies}

    # parsing the <catalogueDesc> node
    vprint("parsing the <catalogueDesc> node...")

    catalogue["catalogueDesc"] = _parse_fields(
        foodex_xml.getElementsByTagName("catalogueDesc")[0],
        int_fields=("termCodeLength",),
        bool_fields=("acceptNonStandardCodes", "generateMissingCodes"),
    )

    # parsing the <catalogueVersion> node
    vprint("parsing the <catalogueVersion> node...")

    catalogue["catalogueVersion"] = _parse_fields(
        foodex_xml.getElementsByTagName("catalogueVersion")[0]
    )

    # parsing the <catalogueGroups> node
    vprint("parsing the <catalogueGroups> node...")

    catalogue["catalogueGroups"] = OrderedDict()
    # only the first child tag is recorded; looking it up by node type instead of by a
    # fixed index makes this work whether or not the XML is pretty-printed
    group_nodes = _non_text_children(
        foodex_xml.getElementsByTagName("catalogueGroups")[0]
    )

    if group_nodes:
        catalogue["catalogueGroups"][group_nodes[0].nodeName] = _node_text(
            group_nodes[0]
        )

    # parsing the <catalogueHierarchies> node
    vprint("parsing the <catalogueHierarchies> node...")

    hierarchies = [
        (_node_text(node.getElementsByTagName("code")[0]), node)
        for node in foodex_xml.getElementsByTagName("hierarchy")
    ]

    # checking whether or not the hierarchies to discard exist
    known_codes = {code for code, _ in hierarchies}

    for code in args.discardedhierarchies:
        if code not in known_codes:
            print(
                '   WARNING: the hierarchy "'
                + code
                + '" listed to discard does not exist in this catalogue'
            )

    catalogue["catalogueHierarchies"] = OrderedDict()

    for code, node in hierarchies:
        if code not in discarded:
            catalogue["catalogueHierarchies"][code] = _parse_hierarchy(node)

    # parsing the <catalogueAttributes> node
    vprint("parsing the <catalogueAttributes> node")

    catalogue["catalogueAttributes"] = OrderedDict()

    for attribute_node in foodex_xml.getElementsByTagName("attribute"):
        code = _node_text(attribute_node.getElementsByTagName("code")[0])
        attribute_dict = OrderedDict()

        for section in _non_text_children(attribute_node):
            attribute_dict[section.nodeName] = _parse_fields(
                section,
                int_fields=("attributeOrder", "attributeMaxLength"),
                bool_fields=(
                    "attributeVisible",
                    "attributeSearchable",
                    "attributeUniqueness",
                    "attributeTermCodeAlias",
                ),
            )

        catalogue["catalogueAttributes"][code] = attribute_dict

    # parsing the <catalogueTerms> node
    vprint("parsing the <catalogueTerms> node...")

    if args.termsfile:
        catalogue_terms = []
    else:
        catalogue["catalogueTerms"] = OrderedDict()

    for term_node in foodex_xml.getElementsByTagName("term"):
        # checking whether the term should be filtered out
        hierarchy_codes = {
            _node_text(node) for node in term_node.getElementsByTagName("hierarchyCode")
        }

        # MTX hierarchy is by default always present
        if len(hierarchy_codes - discarded) <= 1:
            continue

        generic_term_counter += 1
        term_code = _node_text(term_node.getElementsByTagName("termCode")[0])
        term_dict = OrderedDict()

        if args.cookbase:
            term_dict["_id"] = termcode.to_int(term_code)

        sections, expo_assignments = _parse_term_sections(term_node, discarded)
        term_dict.update(sections)
        expo_term_counter += expo_assignments

        if args.termsfile:
            catalogue_terms.append(term_dict)
        else:
            catalogue["catalogueTerms"][term_code] = term_dict

    vprint("writing output...")

    with open(args.outputfile, "w") as f:
        json.dump(catalogue, f, indent=2)

    if args.termsfile:
        if args.single:
            # here args.termsfile is a folder receiving one file per term
            for term in catalogue_terms:
                term_path = os.path.join(
                    args.termsfile, term["termDesc"]["termCode"] + ".json"
                )
                with open(term_path, "w") as f:
                    json.dump(term, f, indent=2)
        elif args.nchunks:
            chunk_size = ceil(len(catalogue_terms) / args.nchunks)
            for i in range(1, args.nchunks + 1):
                start = chunk_size * (i - 1)
                end = chunk_size * i
                with open(args.termsfile + "." + str(i), "w") as f:
                    json.dump(catalogue_terms[start:end], f, indent=2)
        else:
            with open(args.termsfile, "w") as f:
                json.dump(catalogue_terms, f, indent=2)

    vprint("Time elapsed: " + str(ceil(time() - start_time)) + " seconds")
    vprint("Total number of terms: " + str(generic_term_counter))
    vprint("Number of terms in the Exposure Hierarchy: " + str(expo_term_counter))


def _traverse_hierarchy(
    hierarchy_siblings: Dict[str, Any],
    term_to_attach: Optional[Tuple[str, str, Dict[str, Any]]] = None,
) -> bool:
    """Attaches a term under its parent somewhere inside a hierarchy (sub)tree.

    The siblings are searched depth-first for the entry whose :code:`termCode` equals
    the parent code of the term; when found, the term is stored in that entry's
    :code:`children` dictionary, which is created on demand.

    :param hierarchy_siblings: A dictionary describing all (already scanned) hierarchy
      terms sharing the same parent
    :type hierarchy_siblings: dict[str, Any]
    :param term_to_attach: Data describing a term to be attached to hierarchy, given as
      :code:`(parent code, key of the new child, data of the new child)`
    :type term_to_attach: tuple[str, str, dict[str, Any]], optional
    :return: :const:`True` if a term was attached to hierarchy, :const:`False` otherwise
    :rtype: bool
    """
    for sibling in hierarchy_siblings.values():
        if term_to_attach and sibling["termCode"] == term_to_attach[0]:
            # parent found: hang the term below it
            sibling.setdefault("children", {})[term_to_attach[1]] = term_to_attach[2]
            return True

        if "children" not in sibling:
            continue

        # otherwise keep looking further down this branch
        if _traverse_hierarchy(sibling["children"], term_to_attach):
            return True

    return False


def hierarchize(args: argparse.Namespace) -> None:
    """Generates a JSON document describing a hierarchy tree.

    Every ``.json`` file found in ``args.inputfolder`` is loaded as a single term.
    Terms assigned to the hierarchy ``args.hierarchycode`` are placed in the tree below
    their parent (terms whose parent code is ``root`` form the top level); terms without
    an assignment to that hierarchy are ignored. The resulting tree is written to
    ``args.outputfile``. Terms whose parent never shows up in the hierarchy cannot be
    placed; they are reported with a warning and left out of the output.

    :param args: Command-line arguments
    :type args: argparse.Namespace
    """
    hierarchy = {}
    pending = []
    start_time = time()
    hierarchy_term_counter = 0

    vprint("loading terms and including them selectively by hierarchy...")

    with os.scandir(args.inputfolder) as entries:
        # scandir yields entries in arbitrary order; sorting keeps the key order of the
        # generated document reproducible between runs
        term_paths = sorted(e.path for e in entries if e.path.endswith(".json"))

    for term_path in term_paths:
        with open(term_path) as f:
            term = json.load(f)

        try:
            assignment = term["hierarchyAssignments"][args.hierarchycode]
            parent_code = assignment["parentCode"]
        except KeyError:
            # the term does not belong to the requested hierarchy
            continue

        term_name = term["termDesc"]["termExtendedName"]
        tta = (
            parent_code,
            term_name,
            {
                "termCode": term["termDesc"]["termCode"],
                "reportable": assignment["reportable"],
                "order": assignment["order"],
            },
        )

        if parent_code == "root":
            hierarchy[term_name] = tta[2]
        elif not _traverse_hierarchy(hierarchy, tta):
            # the parent has not been loaded yet: retry once all files are read
            pending.append(tta)

        hierarchy_term_counter += 1

    vprint("clearing pool of unattached hierarchy terms...")

    while pending:
        still_pending = [
            tta for tta in pending if not _traverse_hierarchy(hierarchy, tta)
        ]

        if len(still_pending) == len(pending):
            # a full pass attached nothing, so the remaining parents do not exist in
            # this hierarchy; retrying would loop forever
            print(
                "   WARNING: "
                + str(len(still_pending))
                + ' term(s) could not be attached to the hierarchy "'
                + args.hierarchycode
                + '" because their parent term was not found'
            )
            break

        pending = still_pending

    vprint("writing output...")

    with open(args.outputfile, "w") as f:
        json.dump(hierarchy, f, indent=2)

    vprint("Time elapsed: " + str(ceil(time() - start_time)) + " seconds")
    vprint("Total number of terms in hierarchy: " + str(hierarchy_term_counter))


def _main() -> None:
    """Command-line parser.

    Declares the ``parsexml`` and ``hierarchize`` sub-commands, validates the
    combination of options, binds the module-level ``vprint`` according to the
    verbosity level and runs the function attached to the selected sub-command.
    """
    ap = argparse.ArgumentParser(
        description="jsonfoodex - A parsing suite"
        + " for the Cookbase platform which"
        + " transforms from the standard FoodEx2"
        + " XML-formatted file into JSON files.",
        add_help=False,
    )
    ap.add_argument(
        "-h", "--help", action=_HelpAction, help="show this help message and exit"
    )
    subparsers = ap.add_subparsers(dest="command")
    subparsers.required = True
    parsexml_parser = subparsers.add_parser(
        "parsexml", help="parse a FoodEx2 XML file into JSON files"
    )
    parsexml_parser.add_argument("inputfile", help="path to the XML input file")
    parsexml_parser.add_argument("outputfile", help="path to the JSON output file")
    parsexml_parser.add_argument(
        "-t",
        "--termsfile",
        help="path to the JSON separated output file for "
        + "FoodEx2 terms, or folder if flag -s is used",
    )
    pg = parsexml_parser.add_mutually_exclusive_group()
    pg.add_argument(
        "-n",
        "--nchunks",
        type=int,
        help="number of chunks to split the terms output file",
    )
    pg.add_argument(
        "-s",
        "--single",
        action="store_true",
        help="indicate output terms in single files",
    )
    parsexml_parser.add_argument(
        "-v", "--verbose", action="count", default=0, help="increase output verbosity"
    )
    parsexml_parser.add_argument(
        "-d",
        "--discardedhierarchies",
        nargs="*",
        default=[
            "botanic",
            "pest",
            "biomo",
            "legis",
            "feed",
            "partcon",
            "place",
            "vetdrug",
            "report",
            "fpurpose",
            "replev",
            "targcon",
            "feedAddExpo",
        ],
        help="List of hierarchies to be discarded by the "
        + "parser. Not indicating this assigns the default "
        + "discarded hierarchies, and using it without any "
        + "elements means no hierarchy will be discarded.",
    )
    parsexml_parser.add_argument(
        "-cb",
        "--cookbase",
        action="store_true",
        help="activate identifier generation ('_id') for " + "Cookbase platform",
    )
    parsexml_parser.set_defaults(func=parsexml)

    hierarchize_parser = subparsers.add_parser(
        "hierarchize", help="build JSON document describing a hierarchy"
    )
    hierarchize_parser.add_argument(
        "inputfolder", help="path to the directory including single term" + " files"
    )
    hierarchize_parser.add_argument("outputfile", help="path to the JSON output file")
    hierarchize_parser.add_argument(
        "hierarchycode", help="code of the hierarchy to build"
    )
    hierarchize_parser.add_argument(
        "-v", "--verbose", action="count", default=0, help="increase output verbosity"
    )
    hierarchize_parser.set_defaults(func=hierarchize)
    args = ap.parse_args()

    # checking command-line arguments correctness
    if args.command == "parsexml":
        if (args.nchunks is not None or args.single) and args.termsfile is None:
            ap.error(
                "-n/--nchunks and -s/--single can only be used if -t/--termsfile"
                + " is declared"
            )

        # zero used to be silently ignored and a negative value silently produced no
        # terms output at all, so both are rejected up front
        if args.nchunks is not None and args.nchunks < 1:
            ap.error("-n/--nchunks must be a positive integer")

    global vprint
    vprint = print if args.verbose > 0 else lambda _: None
    args.func(args)


# Run the command-line interface only when the module is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    _main()