# Source code for isogeotodocx.utils.formatter
# -*- coding: UTF-8 -*-
# ------------------------------------------------------------------------------
# Name: Isogeo to Microsoft Word 2010
# Purpose: Get metadata from an Isogeo share and store each record in
# its own Word document. It's one of the submodules
# of isogeo2office (https://github.com/isogeo/isogeo-2-office).
#
# Author: Julien Moura (@geojulien) for Isogeo
#
# Python: 3.x
# Created: 14/08/2014
# Updated: 28/01/2016
# ------------------------------------------------------------------------------
# ##############################################################################
# ########## Libraries #############
# ##################################
# Standard library
import logging
import re
from itertools import zip_longest
from xml.sax.saxutils import escape # '<' -> '&lt;'
# 3rd party library
from isogeo_pysdk import (
Condition,
Conformity,
Directive,
IsogeoTranslator,
IsogeoUtils,
Limitation,
)
# ##############################################################################
# ############ Globals ############
# #################################
# module-level logger registered under the "isogeotodocx" name
logger = logging.getLogger("isogeotodocx") # LOG
# shared isogeo_pysdk helper instance; Formatter.specifications uses its
# hlpr_datetimes() method to parse publication dates
utils = IsogeoUtils()
# ##############################################################################
# ########## Classes ###############
# ##################################
class Formatter:
    """Metadata formatter to avoid repeating the same operations on metadata during export in different formats.

    :param str lang: selected language. Compared case-insensitively: "fr" selects
        the French formats, anything else selects the fallback formats.
    """

    # Recognizes one simple XML/HTML tag: either a start tag
    # ('<' name *attr ['/'] '>') or an end tag ('<' '/' name '>'), with optional
    # whitespace between tokens. Names are ASCII letters only and attribute
    # values must be double-quoted. Compiled once here instead of on every
    # clean_xml() call. The single capturing group makes re.split() keep the
    # matched tags in its result.
    _XML_TAG_REGEX = re.compile(
        r"""
        (
            < [ \t\r\n]* [a-zA-Z]+ [ \t\r\n]*
            (?: [a-zA-Z]+ [ \t\r\n]* = [ \t\r\n]* "[^"]*" [ \t\r\n]* )*
            /? [ \t\r\n]* >
          |
            < [ \t\r\n]* / [ \t\r\n]* [a-zA-Z]+ [ \t\r\n]* >
        )
        """,
        flags=re.VERBOSE,
    )

    def __init__(self, lang="FR"):
        # locale
        self.lang = lang.lower()
        # plain dates use the same day/month/year layout whatever the language
        self.dates_fmt = "%d/%m/%Y"
        if self.lang == "fr":
            self.datetimes_fmt = "%A %d %B %Y (%Hh%M)"
            self.locale_fmt = "fr_FR"
        else:
            self.datetimes_fmt = "%a %d %B %Y (%Hh%M)"
            # NOTE(review): "uk_UK" is not a standard locale identifier ("en_GB"
            # was probably intended). Kept unchanged because callers may read
            # this attribute - confirm before changing.
            self.locale_fmt = "uk_UK"

        # store params and imports as attributes
        self.isogeo_tr = IsogeoTranslator(lang).tr

    # ------------ Metadata sections formatter --------------------------------
    def conditions(self, md_conditions: list) -> tuple:
        """Render input metadata CGUs (conditions of use) as a tuple of dictionaries.

        Each output dictionary always has a "description" key. When the condition
        has a license, the license content (if any) is appended to the description
        and the "link" and "name" keys are added.

        :param list md_conditions: input list extracted from an Isogeo metadata

        :rtype: tuple(dict)
        """
        conditions_out = []
        for c_in in md_conditions:
            # load condition object
            condition_in = Condition(**c_in)

            # fall back to the translated "no license" label when there is no
            # description (None or empty string)
            if condition_in.description:
                description = condition_in.description
            else:
                description = self.isogeo_tr("conditions", "noLicense")
            condition = {"description": description}

            license_in = condition_in.license
            if license_in:
                if license_in.content:
                    condition["description"] += "\n" + license_in.content
                condition["link"] = license_in.link
                condition["name"] = license_in.name

            conditions_out.append(condition)

        return tuple(conditions_out)

    def limitations(self, md_limitations: list) -> tuple:
        """Format input metadata limitations as a tuple of dictionaries, ready to be exported.

        Each output dictionary has the "description", "restriction" and "type" keys
        (the last two translated). A "directive" key, formatted as
        "name (description)", is added only for limitations that carry a directive.

        :param list md_limitations: input list of metadata limitations

        :rtype: tuple(dict)
        """
        limitations_out = []
        for lim_in in md_limitations:
            # load limitation object
            limitation_in = Limitation(**lim_in)

            limitation_out = {
                "description": limitation_in.description,
                "restriction": self.isogeo_tr(
                    "restrictions", limitation_in.restriction
                ),
                "type": self.isogeo_tr("limitations", limitation_in.type),
            }

            # only limitations linked to a directive get the extra key
            if limitation_in.directive:
                directive = Directive(**limitation_in.directive)
                limitation_out["directive"] = "{} ({})".format(
                    directive.name, directive.description
                )

            limitations_out.append(limitation_out)

        return tuple(limitations_out)

    def specifications(self, md_specifications: list) -> tuple:
        """Render input metadata specifications (conformity + specification) as a tuple of dictionaries.

        Each output dictionary has the "conformant" (translated label), "name",
        "link" and "published" keys. "published" is the publication date formatted
        with ``self.dates_fmt``, or an empty string when there is none.

        :param list md_specifications: input list extracted from an Isogeo metadata

        :rtype: tuple(dict)
        """
        specifications_out = []
        for conformity in md_specifications:
            # load conformity object
            conf_in = Conformity(**conformity)

            # anything other than an explicit True is reported as not conformant
            if conf_in.conformant is True:
                conformant_label = self.isogeo_tr("quality", "isConform")
            else:
                conformant_label = self.isogeo_tr("quality", "isNotConform")

            specification = conf_in.specification
            spec = {
                "conformant": conformant_label,
                "name": specification.name,
                "link": specification.link,
            }

            # publication date
            if specification.published:
                spec["published"] = utils.hlpr_datetimes(
                    specification.published
                ).strftime(self.dates_fmt)
            else:
                spec["published"] = ""

            specifications_out.append(spec)

        return tuple(specifications_out)

    def clean_xml(self, invalid_xml: str, mode: str = "soft", substitute: str = "_"):
        """Clean string of XML invalid characters.

        Text outside of recognized tags gets its ``&``, ``<`` and ``>`` escaped.
        Recognized tags are kept untouched in soft mode and replaced by
        ``substitute`` in strict mode.

        source: https://stackoverflow.com/a/13322581/2556577

        :param str invalid_xml: xml string to clean. None returns an empty string;
            any other non-string value is returned unchanged.
        :param str mode: mode to apply. Available options:

          * soft [default]: escape chars which are not accepted in XML
          * strict: additionally replace every recognized tag by ``substitute``
        :param str substitute: string used in strict mode in place of each tag

        :rtype: str
        """
        if invalid_xml is None:
            return ""
        if not isinstance(invalid_xml, str):
            return invalid_xml

        strict = mode == "strict"

        # Splitting on a pattern with one capturing group yields an alternating
        # sequence: text, tag, text, tag, ..., text. Even positions are free
        # text, odd positions are the matched tags.
        pieces = Formatter._XML_TAG_REGEX.split(invalid_xml)

        cleaned = []
        for position, piece in enumerate(pieces):
            if position % 2 == 0:
                # escape &, <, > in the text
                cleaned.append(escape(piece))
            elif strict:
                # Replace the tag that was actually recognized, so tags spanning
                # several lines or holding '>' in an attribute value are removed
                # in full.
                cleaned.append(substitute)
            else:
                cleaned.append(piece)

        return "".join(cleaned)
# ###############################################################################
# ###### Stand alone program ########
# ###################################
if __name__ == "__main__":
    # Try me: format a sample set of limitations and print the result.
    formatter = Formatter()

    # values shared by the sample limitations
    sample_id = "1a2b3c4d5e6f7g8h9i0j11k12l13m14n"
    sample_markdown = "**Gras**\n*Italique*\t\n<del>Supprimé</del>\n<cite>Citation</cite>\n\n* Élément 1\n* Élément 2\n\n1. Élément 1\n2. Élément 2\n\n[Foo](http://foo.bar)"

    def legal_limitation(restriction, description=""):
        """Build a sample limitation of type "legal" for the given restriction."""
        return {
            "_id": sample_id,
            "type": "legal",
            "description": description,
            "restriction": restriction,
        }

    # limitations
    fixture_limitations = [
        # legal limitation linked to a directive
        {
            "_id": sample_id,
            "type": "legal",
            "description": sample_markdown,
            "restriction": "license",
            "directive": {
                "_id": sample_id,
                "name": "Pas de restriction d’accès public selon INSPIRE",
                "description": "Aucun des articles de la loi ne peut être invoqué pour justifier d’une restriction d’accès public.",
            },
        },
        # security limitation: no restriction, no directive
        {
            "_id": sample_id,
            "type": "security",
            "description": sample_markdown,
        },
        # legal limitations, one per restriction kind
        legal_limitation("other"),
        legal_limitation("patentPending"),
        legal_limitation("copyright", "Ceci est un **copyright**"),
        legal_limitation("trademark"),
        legal_limitation("patent"),
        legal_limitation("intellectualPropertyRights"),
        legal_limitation("restricted"),
    ]

    print(formatter.limitations(fixture_limitations))