# Skip to content
# Snippets Groups Projects
# fix_links.py 12.32 KiB
import re
from pprint import pprint
import json
import requests
import urllib.request
import base64
from .my_logging import logging
#import logging


def translate_content_type( content_type ):
    """Translate a ``Content-Type`` value into a short, upper-case format label.

    Matching is case-insensitive and ignores media-type parameters such as
    ``; charset=utf-8`` for the exact-match entries (ODS, XLS).

    Args:
        content_type: the media type string, e.g. ``'application/pdf'`` or
            ``'text/html; charset=utf-8'``.

    Returns:
        One of the known labels (``'PDF'``, ``'HTML'``, ``'ZIP'``, ``'XML'``,
        ``'JAVASCRIPT'``, ``'JSON'``, ``'CSV'``, ``'TIFF'``, ``'ODS'``,
        ``'XLS'``) when the content type is recognised; otherwise the
        ``content_type`` argument itself, unchanged.
    """
    # Media types are case-insensitive and may be followed by parameters,
    # so normalise before comparing.
    normalized = content_type.lower()
    media_type = normalized.split(';', 1)[0].strip()

    # Types whose label cannot be derived from a substring of the media type.
    exact_matches = {
        'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
        'application/vnd.ms-excel': 'XLS',
    }
    if media_type in exact_matches:
        return exact_matches[media_type]

    # TODO: complete the following list!
    # Order matters: the first substring found wins (e.g. 'html' is tested
    # before 'xml', so 'application/xhtml+xml' yields 'HTML').
    types = ['pdf', 'html', 'zip', 'xml', 'javascript', 'json', 'csv', 'tiff']

    for the_type in types:
        if the_type in normalized:
            return the_type.upper()

    # Unknown type: hand the original value back untouched.
    return content_type


def protocol_to_formats_and_services( links ):
    """Replace each link's 'protocol' attribute by 'formats' and 'service'.

    Args:
        links: a list of dicts describing links. The dicts are copied, so the
            caller's list and its dicts are left untouched.

    Returns:
        A new list of dicts, in the same order, in which:

        * the 'protocol' key has been removed;
        * for the known service protocols (OGC:WMS, OGC:WFS, OGC:WCS, KML,
          WS, SOS) a 'service' name and the list of 'formats' were added;
        * 'HTML' and 'WWW:...' protocols get neither 'formats' nor 'service';
        * any other protocol is reported as the single entry of 'formats';
        * links without a 'protocol' key are returned as they are.
    """
    # protocol -> (service, formats)
    known_protocols = {
        'OGC:WMS': ('WMS', ['PNG', 'JPEG']),
        'OGC:WFS': ('WFS', ['GML', 'GeoJSON', 'ShapeFile']),
        'OGC:WCS': ('WCS', ['TIFF']),
        'KML': ('KML', ['KML']),
        'WS': ('WS', ['JSON', 'ShapeFile']),
        'SOS': ('SOS', ['JSON', 'XML']),
    }

    output = []

    for link in links:

        # work on a copy: a plain list.copy() would share the dicts with the caller
        entry = dict(link)
        output.append(entry)

        if 'protocol' not in entry:
            continue

        protocol = entry.pop('protocol')

        if protocol in known_protocols:
            service, formats = known_protocols[protocol]
            # a fresh list per link, so that entries never share a 'formats' object
            entry['formats'] = list(formats)
            entry['service'] = service
        elif protocol == 'HTML' or protocol.startswith("WWW:"):
            # in order to prevent HTML resources from being deemed downloadable
            pass
        else:
            entry['formats'] = [ protocol ]

    return output


def _find_basic_auth( url, credentials ):
    """Return the (username, password) pair of the first credential whose domain appears in `url`, or None."""
    for domain, credential in credentials.items():
        if domain in url and credential['username'] is not None and credential['password']:
            return credential['username'], credential['password']
    return None


def _head_headers( url, credentials ):
    """Send a HEAD request to `url` (with Basic Auth when a credential matches) and return the response headers."""
    auth = _find_basic_auth(url, credentials)
    if auth is not None:
        logging.info('Found a valid credential for %s' % url)

    # N.B. (original author's note):
    # when used with Basic Auth, the requests library strips the Content-Length header from the response;
    # that's why we use urllib.request here...
    req = urllib.request.Request( url, method="HEAD", headers={'User-Agent': 'Mozilla/5.0'})

    if auth is not None:
        base64string = base64.b64encode(("%s:%s" % auth).encode('ascii'))
        req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))

    # the context manager closes the response even if reading it fails
    with urllib.request.urlopen(req) as resp:
        return resp.headers


def _guess_format_from_url( url ):
    """Return the upper-cased known format the URL ends with, or None when there is none."""
    lowered_url = url.lower()
    for known_format in ('ecw', 'pdf', 'zip', 'json', 'tif', 'tiff', 'csv'):
        if lowered_url.endswith(known_format):
            return known_format.upper()
    return None


def fix_links( links, credentials=None ):
    """Clean up a list of link descriptions and derive their formats/services.

    The following fixes are applied, in this order:

    1. the 'unknown' attribute is dropped and the trailing "(OGC:xxx)" markers
       are removed from the 'description';
    2. the 'protocol' is normalised: the KML media type becomes 'KML', URLs
       containing '/sos/' become 'SOS', and vague protocols (containing
       'WWW', 'kml', 'html' or 'null') are replaced by what a HEAD request
       on the URL reports or, failing that, by a guess based on the URL
       ending;
    3. 'TIF' is renamed 'TIFF'; 'JSON' links are turned into 'WS' links whose
       URL is cut down to its first five '/'-separated pieces; the query
       string of 'SOS' URLs is removed;
    4. 'KML' links are pointed at the KML counterpart of the WS URL, or
       dropped when no WS link exists;
    5. the result is passed to protocol_to_formats_and_services().

    Args:
        links: list of dicts; the code reads the 'url', 'name', 'description',
            'protocol' and 'unknown' keys. The list and its dicts are not
            modified.
        credentials: optional dict mapping a domain (matched as a substring
            of the URL) to a dict with 'username' and 'password' keys, used
            for HTTP Basic Auth. None means "no credentials".

    Returns:
        The value returned by protocol_to_formats_and_services() for the
        fixed links.
    """
    # None used to be iterated over inside the try block below, which raised and
    # silently disabled the HEAD request for every caller not passing credentials.
    credentials = credentials or {}

    # copy each dict: a plain list.copy() would let us modify the caller's links
    fixed_links = [dict(link) for link in links]

    # the 'protocol' attribute is used, today in a rather meaningless way; let's try improving it... (WWW-LINK -> PDF, ...)
    for link in fixed_links:

        link.pop('unknown', None)

        # the 'description' attribute ends, sometimes, with (OGC:WMS), (OGC:WCS), ..., which is redundant and, in some cases, erroneous.
        if 'description' in link:
            link['description'] = re.sub(r'\(OGC:WMS\)|\(OGC:WCS\)|\(OGC:WFS\)|\(OGC:SOS\)', '', link['description']).strip()

        if 'protocol' not in link:
            continue

        # KML
        if link['protocol'] == "application/vnd.google-earth.kml+xml":
            link['protocol'] = 'KML'
            continue

        # SOS
        if '/sos/' in link['url'].lower():
            link['protocol'] = 'SOS'
            continue

        # FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
        # The KML protocol needs also to be fixed.
        if any( x in link['protocol'] for x in ('WWW', 'kml', 'html', 'null') ):
            try:
                # let's try getting the information from the Web Server...
                headers = _head_headers(link['url'], credentials)

                content_type = headers['Content-Type']
                if content_type is None:
                    # nothing to translate: fall back to the URL-based guess below
                    raise ValueError('no Content-Type header')

                # the presence of the content-length assures that the Web Server knows what it is talking about,
                # that is why we include the following line in this try-except block, except for HTML pages, in which case
                # webservers do not send back the content-length
                if 'text/html' not in content_type:
                    link['content-length'] = headers['Content-Length']
                # N.B.: we use the attribute 'protocol' in the output dict 'cause that's an attribute that GeoNetwork lets the user fill,
                # as opposed to the 'content-type' attribute which is not shown by GeoNetwork's GUI, despite being exposed by the 'q' API.
                # Ideally, in the future the 'protocol' attribute will be filled the right way by the back-office team.
                link['protocol'] = translate_content_type(content_type)

            except Exception as e:
                # deliberate best effort: any failure (network, bad URL, malformed credential, ...) leads to the fallback
                logging.debug('Did not manage to HEAD %s (%s)' % (link['url'], e))

                # ...otherwise, we make a guess on the basis of the information carried by the URL
                guessed_format = _guess_format_from_url(link['url'])
                if guessed_format is not None:
                    link['protocol'] = guessed_format

    # These three fixes are independent from one another, hence the single pass.
    the_fixed_ws_url = None
    for link in fixed_links:

        protocol = link.get('protocol')

        # FIX TIF -> TIFF
        if protocol == 'TIF':
            link['protocol'] = 'TIFF'

        # FIX WS / JSON
        elif protocol == 'JSON':
            link['protocol'] = 'WS'
            link['name'] = link['name'].split('/')[0]

            # keep the first five '/'-separated pieces, e.g.
            # https://host/ws/grandlyon/layer/all.json -> https://host/ws/grandlyon
            the_fixed_ws_url = '/'.join(link['url'].split('/')[0:5])
            link['url'] = the_fixed_ws_url

        # FIX SOS
        elif protocol == 'SOS':
            link['url'] = link['url'].split('?')[0]

    # FIX KML
    # A new list is built instead of deleting from the list being iterated over,
    # which used to skip the element following each deleted one.
    kept_links = []
    for link in fixed_links:

        if link.get('protocol') == 'KML':
            if the_fixed_ws_url is None:
                # no WS, ergo no KML service: drop the link
                continue
            # NOTE(review): this replaces every 'ws' occurrence in the URL, not only the path segment -- confirm it is safe for all hosts
            link['url'] = the_fixed_ws_url.replace('ws', 'kml')

        kept_links.append(link)

    return protocol_to_formats_and_services(kept_links)


if __name__ == '__main__':

    # Manual smoke test: a handful of sample link records, covering the
    # protocols handled by fix_links(), are fixed and pretty-printed.
    sample_links = [
        {
            "url": "https://download.data.grandlyon.com/wms/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.ogc.wms_xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "OGC:WMS",
        },
        {
            "url": "../../srv/en/google.kml?uuid=fe13ef2c-d516-4335-84ce-8f910b9388f8&layers=cad_cadastre.cadbornelimitepropriete",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.google-earth.kml+xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "KML",
        },
        {
            "url": "https://download.data.grandlyon.com/wfs/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "OGC:WFS",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon) (OGC:WFS)",
            "unknown": "1",
            "protocol": "OGC:WFS",
        },
        {
            "url": "http://www.cadastre.gouv.fr/scpc/accueil.do",
            "name": "cadastre.gouv.fr",
            "content-type": "text/html",
            "description": "Service de consultation du plan cadastral - Ministère du budget, des comptes publics, de la fonction publique et de la réforme de l'Etat",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        {
            "url": "https://download.data.grandlyon.com/ws/grandlyon/cad_cadastre.cadbornelimitepropriete/all.json",
            "name": "cad_cadastre.cadbornelimitepropriete/all.json",
            "content-type": "text/html",
            "description": "Description des données dans le format texte JSON",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "WWW-LINK",
        },
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "null",
        },
        {
            "name": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "description": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "url": "https://download.data.grandlyon.com/sos/velov?request=getCapabilities&service=SOS",
            "protocol": "WWW:LINK-1.0-http--link",
            "content-type": "text/html",
            "unknown": "1",
        },
        {
            "name": "Démonstrateur WMS-Time",
            "description": "Visualisation cartographique temporelle (WMS-T) de l'historique des disponibilités des stations Vélo'V",
            "url": "http://demo.data.grandlyon.com/wmst/reseau_velov.html",
            "protocol": "text/html",
            "content-type": "text/html",
            "unknown": "1",
        },
        {
            "name": "alerte-pollens",
            "description": "alerte-pollens",
            "url": "http://www.pollens.fr/alerte-pollens",
            "protocol": "WWW-LINK",
            "content-type": "WWW:LINK",
            "unknown": "0",
        },
        {
            "name": "MNT2009_ombrage_10m_CC46",
            "description": "Ombrage du relief du Grand Lyon 2009",
            "url": "https://download.data.grandlyon.com/wcs/grandlyon",
            "protocol": "OGC:WCS",
            "content-type": "OGC:WCS",
            "unknown": "1",
        },
        {
            "name": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
            "description": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
            "url": "http://www.prefectures-regions.gouv.fr/auvergne-rhone-alpes/content/download/35211/238621/file/5-7-2017_recueil-84-2017-096-recueil-des-actes-administratifs-special.pdf",
            "protocol": "WWW:LINK-1.0-http--link",
            "content-type": "text/html",
            "unknown": "1",
        },
    ]

    # No credentials are supplied, so any HEAD request is sent unauthenticated.
    pprint(fix_links(sample_links))