Source code for isogeotodocx.utils.formatter

# -*- coding: UTF-8 -*-

# ------------------------------------------------------------------------------
# Name:         Isogeo to Microsoft Word 2010
# Purpose:      Get metadata from an Isogeo share and store each record in
#               its own Word document. It's one of the submodules
#               of isogeo2office (https://github.com/isogeo/isogeo-2-office).
#
# Author:       Julien Moura (@geojulien) for Isogeo
#
# Python:       3.x
# Created:      14/08/2014
# Updated:      28/01/2016
# ------------------------------------------------------------------------------

# ##############################################################################
# ########## Libraries #############
# ##################################

# Standard library
import logging
import re
from itertools import zip_longest
from xml.sax.saxutils import escape  # '<' -> '&lt;'

# 3rd party library
from isogeo_pysdk import (
    Condition,
    Conformity,
    Directive,
    IsogeoTranslator,
    IsogeoUtils,
    Limitation,
)

# ##############################################################################
# ############ Globals ############
# #################################

logger = logging.getLogger("isogeotodocx")  # LOG
utils = IsogeoUtils()

# ##############################################################################
# ########## Classes ###############
# ##################################


class Formatter(object):
    """Metadata formatter to avoid repeating operations on metadata during export
    in different formats.

    :param str lang: selected language. ``"fr"`` (case-insensitive) selects the
        French date/datetime formats and locale; any other value selects the
        alternative formats.
    """

    # Matches a single start tag (letters-only name, optional double-quoted
    # attributes, optional self-closing slash) or a single end tag, with optional
    # whitespace between tokens. Compiled once at class creation instead of on
    # every `clean_xml` call. The capturing group makes `split()` keep the tags.
    # note: fragile against missing '"'; single-quoted attributes are not matched
    _TAG_REGEX = re.compile(
        r"""(
            < [ \t\r\n]* [a-zA-Z]+ [ \t\r\n]*
            (?: [a-zA-Z]+ [ \t\r\n]* = [ \t\r\n]* "[^"]*" [ \t\r\n]* )*
            /? [ \t\r\n]* >
            |
            < [ \t\r\n]* / [ \t\r\n]* [a-zA-Z]+ [ \t\r\n]* >
        )""",
        flags=re.VERBOSE,
    )

    def __init__(self, lang="FR"):
        # locale
        self.lang = lang.lower()
        if self.lang == "fr":
            self.dates_fmt = "%d/%m/%Y"
            self.datetimes_fmt = "%A %d %B %Y (%Hh%M)"
            self.locale_fmt = "fr_FR"
        else:
            self.dates_fmt = "%d/%m/%Y"
            self.datetimes_fmt = "%a %d %B %Y (%Hh%M)"
            # NOTE(review): "uk_UK" does not look like a standard locale name
            # (British English is usually "en_GB") - kept unchanged because
            # callers reading this attribute are not visible here; confirm.
            self.locale_fmt = "uk_UK"

        # store params and imports as attributes
        self.isogeo_tr = IsogeoTranslator(lang).tr

    # ------------ Metadata sections formatter --------------------------------
    def conditions(self, md_conditions: list) -> tuple:
        """Render input metadata CGUs (conditions) as a tuple of dictionaries.

        Each output dictionary always has a ``description`` key (the condition
        description, or the translated "noLicense" label when it is empty). When
        the condition has a license, its content is appended to the description
        on a new line and the ``link`` and ``name`` keys are added.

        :param list md_conditions: input list extracted from an Isogeo metadata

        :rtype: tuple(dict)
        """
        # output list
        conditions_out = []
        for c_in in md_conditions:
            # load condition object
            condition_in = Condition(**c_in)
            # build out dict
            condition = {}

            # fall back on the translated default label when there is no description
            if condition_in.description:
                condition["description"] = condition_in.description
            else:
                condition["description"] = self.isogeo_tr("conditions", "noLicense")

            if condition_in.license:
                if condition_in.license.content:
                    condition["description"] += "\n" + condition_in.license.content
                condition["link"] = condition_in.license.link
                condition["name"] = condition_in.license.name

            # add to the final list
            conditions_out.append(condition)

        # return formatted result
        return tuple(conditions_out)

    def limitations(self, md_limitations: list) -> tuple:
        """Format input metadata limitations as a flat tuple of dictionaries,
        ready to be exported.

        Each output dictionary has the ``description``, ``restriction`` and
        ``type`` keys (the last two translated). Limitations carrying a directive
        additionally get a ``directive`` key formatted as "name (description)".

        :param list md_limitations: input list of metadata limitations

        :rtype: tuple(dict)
        """
        limitations_out = []
        for lim_in in md_limitations:
            # load limitation object
            limitation_in = Limitation(**lim_in)
            # build out dict
            limitation_out = {}
            # fill it
            limitation_out["description"] = limitation_in.description
            limitation_out["restriction"] = self.isogeo_tr(
                "restrictions", limitation_in.restriction
            )
            limitation_out["type"] = self.isogeo_tr("limitations", limitation_in.type)

            # only limitations with a directive get the extra key
            if limitation_in.directive:
                directive = Directive(**limitation_in.directive)
                limitation_out["directive"] = "{} ({})".format(
                    directive.name, directive.description
                )

            limitations_out.append(limitation_out)

        # return formatted result
        return tuple(limitations_out)

    def specifications(self, md_specifications: list) -> tuple:
        """Render input metadata specifications (conformity + specification) as a
        tuple of dictionaries.

        Each output dictionary has the ``conformant`` (translated label), ``name``,
        ``link`` and ``published`` keys; ``published`` is the publication date
        formatted with ``self.dates_fmt``, or an empty string when there is none.

        :param list md_specifications: input list extracted from an Isogeo metadata

        :rtype: tuple(dict)
        """
        # output list
        specifications_out = []
        for conformity in md_specifications:
            # load conformity object
            conf_in = Conformity(**conformity)
            # build out dict
            spec = {}
            # translate - anything other than an explicit True is "not conform"
            if conf_in.conformant is True:
                spec["conformant"] = self.isogeo_tr("quality", "isConform")
            else:
                spec["conformant"] = self.isogeo_tr("quality", "isNotConform")
            spec["name"] = conf_in.specification.name
            spec["link"] = conf_in.specification.link
            # publication date
            if conf_in.specification.published:
                spec["published"] = utils.hlpr_datetimes(
                    conf_in.specification.published
                ).strftime(self.dates_fmt)
            else:
                spec["published"] = ""
            # append
            specifications_out.append(spec)

        # return formatted result
        return tuple(specifications_out)

    def clean_xml(self, invalid_xml: str, mode: str = "soft", substitute: str = "_"):
        """Clean string of XML invalid characters.

        source: https://stackoverflow.com/a/13322581/2556577

        :param str invalid_xml: xml string to clean. ``None`` gives an empty
            string; any other non-string value is returned unchanged.
        :param str mode: mode to apply. Available options:

          * soft [default]: escape ``&``, ``<`` and ``>`` found in the text
            outside of recognized tags
          * strict: same as soft, then replace every remaining ``<...>`` span
            with `substitute`

        :param str substitute: replacement used for tags in strict mode

        :returns: the cleaned string
        """
        if invalid_xml is None:
            return ""
        if not isinstance(invalid_xml, str):
            return invalid_xml

        # split() with a capturing group alternates: text, tag, text, tag, ..., text
        chunks = self._TAG_REGEX.split(invalid_xml)
        # iterate 2 items at a time: (text, tag); the trailing text gets an empty tag
        iters = [iter(chunks)] * 2
        pairs = zip_longest(*iters, fillvalue="")

        # escape &, <, > in the text only, tags are kept verbatim
        clean_version = "".join(escape(text) + tag for text, tag in pairs)

        if mode == "strict":
            clean_version = re.sub(r"<.*?>", substitute, clean_version)

        return clean_version
# ###############################################################################
# ###### Stand alone program ########
# ###################################
if __name__ == "__main__":
    # Try me: manual smoke run of the limitations formatter on sample data
    sample_id = "1a2b3c4d5e6f7g8h9i0j11k12l13m14n"
    # markdown + inline HTML sample shared by the first two limitations
    sample_markdown = "**Gras**\n*Italique*\t\n<del>Supprimé</del>\n<cite>Citation</cite>\n\n* Élément 1\n* Élément 2\n\n1. Élément 1\n2. Élément 2\n\n[Foo](http://foo.bar)"

    def _legal_limitation(restriction, description=""):
        """Build a 'legal' limitation fixture for the given restriction code."""
        return {
            "_id": sample_id,
            "type": "legal",
            "description": description,
            "restriction": restriction,
        }

    # limitations: one with an INSPIRE directive, one security, then plain legal ones
    fixture_limitations = [
        {
            "_id": sample_id,
            "type": "legal",
            "description": sample_markdown,
            "restriction": "license",
            "directive": {
                "_id": sample_id,
                "name": "Pas de restriction d’accès public selon INSPIRE",
                "description": "Aucun des articles de la loi ne peut être invoqué pour justifier d’une restriction d’accès public.",
            },
        },
        {
            "_id": sample_id,
            "type": "security",
            "description": sample_markdown,
        },
        _legal_limitation("other"),
        _legal_limitation("patentPending"),
        _legal_limitation("copyright", "Ceci est un **copyright**"),
        _legal_limitation("trademark"),
        _legal_limitation("patent"),
        _legal_limitation("intellectualPropertyRights"),
        _legal_limitation("restricted"),
    ]

    formatter = Formatter()
    print(formatter.limitations(fixture_limitations))