Skip to content
Snippets Groups Projects
fix_links.py 10.1 KiB
Newer Older
  • Learn to ignore specific revisions
  • Alessandro Cerioni's avatar
    Alessandro Cerioni committed
    import re
    from pprint import pprint
    import json
    import requests
    from .my_logging import logging
    
    Alessandro Cerioni's avatar
    Alessandro Cerioni committed
    #import logging
    
    
    Alessandro Cerioni's avatar
    Alessandro Cerioni committed
    
    def translate_content_type( content_type ):
        """Translate an HTTP Content-Type value into a short format label.

        The two spreadsheet media types are mapped explicitly ('ODS', 'XLS').
        Otherwise, the first entry of the known-types list that occurs as a
        substring of the content type is returned, upper-cased (e.g.
        'application/pdf' -> 'PDF', 'text/html; charset=utf-8' -> 'HTML').
        Matching is case-insensitive.

        Args:
            content_type: the value of a Content-Type header, as a string.

        Returns:
            The upper-case label of the recognized format, or ``content_type``
            itself, unchanged, when the type is not recognized.
        """

        # TODO: complete the following list!
        known_types = ('pdf', 'html', 'zip', 'xml', 'javascript', 'json', 'csv')

        # Media types which must match exactly (parameters such as "; charset=..."
        # are ignored for this comparison).
        exact_types = {
            'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
            'application/vnd.ms-excel': 'XLS',
        }

        normalized = content_type.lower()
        media_type = normalized.split(';')[0].strip()

        # The exact matches take precedence over the substring search.
        if media_type in exact_types:
            return exact_types[media_type]

        for known_type in known_types:
            if known_type in normalized:
                return known_type.upper()

        # Unknown type: hand the original value back rather than failing.
        return content_type
    
    Alessandro Cerioni's avatar
    Alessandro Cerioni committed
    def protocol_to_formats_and_services( links ):
        """Replace the 'protocol' attribute of each link by 'formats' (and 'service').

        For the protocols listed in the table below, the link gets the
        corresponding list of formats and the service name. For any other
        protocol, the link only gets 'formats', set to a one-element list
        holding the protocol itself; no 'service' key is added.

        The input is left untouched: the returned list holds new dicts.

        Args:
            links: a list of dicts, each one carrying a 'protocol' key.

        Returns:
            A new list of dicts, without the 'protocol' key.

        Raises:
            KeyError: if a link has no 'protocol' key.
        """

        # protocol -> (formats, service)
        known_protocols = {
            'OGC:WMS': (['PNG', 'JPEG'], 'WMS'),
            'OGC:WFS': (['GML', 'GeoJSON', 'ShapeFile'], 'WFS'),
            'OGC:WCS': (['TIFF'], 'WCS'),
            'KML': (['KML'], 'KML'),
            'WS': (['JSON', 'ShapeFile'], 'WS'),
            'SOS': (['JSON', 'XML'], 'SOS'),
        }

        output = []

        for link in links:

            # copy the link, so that the caller's dict is not modified
            converted = dict(link)
            protocol = converted.pop('protocol')

            if protocol in known_protocols:
                formats, service = known_protocols[protocol]
                # a fresh list per link, so that the outputs do not share state
                converted['formats'] = list(formats)
                converted['service'] = service
            else:
                converted['formats'] = [ protocol ]

            output.append(converted)

        return output
    
    
    Alessandro Cerioni's avatar
    Alessandro Cerioni committed
    
    def fix_links( links, timeout=10 ):
        """Clean up a list of link dicts and convert them to formats/services.

        For each link:
          * the 'unknown' attribute is dropped;
          * the "(OGC:WMS)", "(OGC:WCS)", "(OGC:WFS)", "(OGC:SOS)" markers are
            removed from the 'description';
          * if the 'protocol' contains 'WWW', 'kml', 'html' or 'null', the Web
            Server is asked (HEAD request) for the Content-Type, from which the
            protocol is derived; upon any failure, a guess is made on the basis
            of the URL.

        Then: 'TIF' becomes 'TIFF'; 'JSON' links become 'WS' links (name cut at
        the first '/', URL cut to its first five '/'-separated pieces); 'KML'
        links get a URL derived from the last fixed WS URL, or are dropped if no
        WS link was found; 'SOS' links lose their query string.

        The input is left untouched: the function works on copies of the dicts.

        Args:
            links: a list of link dicts.
            timeout: the timeout, in seconds, passed to the HEAD requests.

        Returns:
            The output of protocol_to_formats_and_services, applied to the
            fixed links.

        Raises:
            KeyError: if a link has no 'protocol' attribute (raised by the
                final conversion).
        """

        # one copy per link, so that the caller's dicts are not modified
        fixed_links = [ dict(link) for link in links ]

        # the 'protocol' attribute is used, today in a rather meaningless way; let's try improving it... (WWW-LINK -> PDF, ...)
        for link in fixed_links:

            link.pop('unknown', None)

            # the 'description' attribute ends, sometimes, with (OGC:WMS), (OGC:WCS), ..., which is redundant and, in some cases, erroneous.
            if 'description' in link:
                link['description'] = _OGC_MARKER_RE.sub('', link['description']).strip()

            # FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
            # The KML protocol needs also to be fixed.
            protocol = link.get('protocol')
            if protocol is not None and any( x in protocol for x in ('WWW', 'kml', 'html', 'null') ):
                _fix_protocol( link, timeout )

        # the URL of the last WS found, if any; needed to fix the KML links
        the_fixed_ws_url = None

        for link in fixed_links:

            protocol = link.get('protocol')

            # FIX TIF -> TIFF
            if protocol == 'TIF':
                link['protocol'] = 'TIFF'

            # FIX WS / JSON
            elif protocol == 'JSON':
                link['protocol'] = 'WS'

                if 'name' in link:
                    link['name'] = link['name'].split('/')[0]

                if 'url' in link:
                    the_fixed_ws_url = '/'.join( link['url'].split('/')[0:5] )
                    link['url'] = the_fixed_ws_url

            # FIX SOS: drop the query string
            elif protocol == 'SOS':
                url = link.get('url', '')
                if '?' in url:
                    link['url'] = url.split('?')[0]

        # FIX KML: this requires a second pass, since the WS URL has to be known.
        # A new list is built, instead of deleting items from the list being iterated.
        kept_links = []

        for link in fixed_links:

            if link.get('protocol') == 'KML':
                if the_fixed_ws_url is None:
                    # no WS, ergo no KML service: the link is dropped
                    continue
                # the WS exists, ergo the KML service
                link['url'] = _ws_url_to_kml_url( the_fixed_ws_url )

            kept_links.append(link)

        return protocol_to_formats_and_services(kept_links)


    # The markers which sometimes show up in the 'description' attribute.
    _OGC_MARKER_RE = re.compile(r'\(OGC:WMS\)|\(OGC:WCS\)|\(OGC:WFS\)|\(OGC:SOS\)')

    # The formats looked for in the URL, when the Web Server is of no help.
    _URL_KNOWN_FORMATS = ('ecw', 'pdf', 'zip', 'kml', 'json', 'tif', 'tiff', 'csv', 'sos')


    def _fix_protocol( link, timeout ):
        """Set, in place, the 'protocol' of a link from the Web Server's answer or from the URL."""

        try:

            # let's try getting the information from the Web Server...
            resp = requests.head( link['url'], allow_redirects=True, timeout=timeout )
            content_type = resp.headers['Content-Type']

            # the presence of the content-length assures that the Web Server knows what it is talking about,
            # that is why we include the following line in this try-except block, except for HTML pages, in which case
            # webservers do not send back the content-length
            if 'text/html' not in content_type:
                link['content-length'] = resp.headers['Content-Length']

            # N.B.: we use the attribute 'protocol' in the output dict 'cause that's an attribute that GeoNetwork lets the user fill,
            # as opposed to the 'content-type' attribute which is not shown by GeoNetwork's GUI, despite being exposed by the 'q' API.
            # Ideally, in the future the 'protocol' attribute will be filled the right way by the back-office team.
            link['protocol'] = translate_content_type(content_type)

        except Exception as e:
            # deliberately broad: whatever goes wrong (network error, timeout,
            # missing header, ...), we fall back to the URL
            logging.debug(e)

            # ...otherwise, we make a guess on the basis of the information carried by the URL
            guess = _guess_protocol_from_url( link.get('url', '') )
            if guess is not None:
                link['protocol'] = guess


    def _guess_protocol_from_url( url ):
        """Return the upper-cased known format found in the URL, or None if there is none.

        If several known formats are found, the one which comes last in
        _URL_KNOWN_FORMATS wins (e.g. 'tiff' wins over 'tif').
        """

        lowered_url = url.lower()
        guess = None

        for known_format in _URL_KNOWN_FORMATS:
            if known_format in lowered_url:
                guess = known_format.upper()

        return guess


    def _ws_url_to_kml_url( ws_url ):
        """Return the WS URL with each 'ws' path segment replaced by 'kml'."""

        # only whole segments are replaced, so that a "ws" showing up elsewhere
        # (e.g. within the host name) is left alone
        return '/'.join( 'kml' if segment == 'ws' else segment for segment in ws_url.split('/') )
    
    Alessandro Cerioni's avatar
    Alessandro Cerioni committed
    
    
    if __name__ == '__main__':

        # Sample links, used to try fix_links out by hand; the result is pretty-printed.
        links = [
                    {
                        'url': "https://download.data.grandlyon.com/wms/grandlyon",
                        'name': "cad_cadastre.cadbornelimitepropriete",
                        'content-type': "application/vnd.ogc.wms_xml",
                        'description': "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
                        'unknown': "1",
                        'protocol': "OGC:WMS"
                    },
                    {
                        'url': "../../srv/en/google.kml?uuid=fe13ef2c-d516-4335-84ce-8f910b9388f8&layers=cad_cadastre.cadbornelimitepropriete",
                        'name': "cad_cadastre.cadbornelimitepropriete",
                        'content-type': "application/vnd.google-earth.kml+xml",
                        'description': "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
                        'unknown': "1",
                        'protocol': "KML"
                    },
                    {
                        'url': "https://download.data.grandlyon.com/wfs/grandlyon",
                        'name': "cad_cadastre.cadbornelimitepropriete",
                        'content-type': "OGC:WFS",
                        'description': "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon) (OGC:WFS)",
                        'unknown': "1",
                        'protocol': "OGC:WFS"
                    },
                    {
                        'url': "http://www.cadastre.gouv.fr/scpc/accueil.do",
                        'name': "cadastre.gouv.fr",
                        'content-type': "text/html",
                        'description': "Service de consultation du plan cadastral - Ministère du budget, des comptes publics, de la fonction publique et de la réforme de l'Etat",
                        'unknown': "1",
                        'protocol': "WWW:LINK-1.0-http--link"
                    },
                    {
                        'url': "https://download.data.grandlyon.com/ws/grandlyon/cad_cadastre.cadbornelimitepropriete/all.json",
                        'name': "cad_cadastre.cadbornelimitepropriete/all.json",
                        'content-type': "text/html",
                        'description': "Description des données dans le format texte JSON",
                        'unknown': "1",
                        'protocol': "WWW:LINK-1.0-http--link"
                    },
                    {
                        'url': "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
                        'name': "Licence ouverte",
                        'content-type': "text/html",
                        'description': "Licence ouverte",
                        'unknown': "1",
                        'protocol': "WWW-LINK"
                    },
                    {
                        'url': "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
                        'name': "Licence ouverte",
                        'content-type': "text/html",
                        'description': "Licence ouverte",
                        'unknown': "1",
                        'protocol': "null"
                    },
                    {
                        'name': "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
                        'description': "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
                        'url': "https://download.data.grandlyon.com/sos/velov?request=getCapabilities&service=SOS",
                        'protocol': "WWW:LINK-1.0-http--link",
                        'content-type': "text/html",
                        'unknown': "1"
                    },
                    {
                        'name': "Démonstrateur WMS-Time",
                        'description': "Visualisation cartographique temporelle (WMS-T) de l'historique des disponibilités des stations Vélo'V",
                        'url': "http://demo.data.grandlyon.com/wmst/reseau_velov.html",
                        'protocol': "text/html",
                        'content-type': "text/html",
                        'unknown': "1"
                    },
                    {
                        "name": "alerte-pollens",
                        "description": "alerte-pollens",
                        "url": "http://www.pollens.fr/alerte-pollens",
                        "protocol": "WWW-LINK",
                        "content-type": "WWW:LINK",
                        "unknown": "0"
                    },
                    {
                        "name": "MNT2009_ombrage_10m_CC46",
                        "description": "Ombrage du relief du Grand Lyon 2009",
                        "url": "https://download.data.grandlyon.com/wcs/grandlyon",
                        "protocol": "OGC:WCS",
                        "content-type": "OGC:WCS",
                        "unknown": "1"
                    }
            ]

        fixed_links = fix_links(links)
        pprint(fixed_links)