# fix_links.py (12.32 KiB)
# Authored by Alessandro Cerioni
import re
from pprint import pprint
import json
import requests
import urllib.request
import base64
from .my_logging import logging
#import logging
def translate_content_type( content_type ):
    """Translate an HTTP Content-Type value into a short, upper-case format label.

    The bare media type (parameters such as ``; charset=UTF-8`` removed) is
    first compared with a few exact matches (ODS, XLS); otherwise the first
    known keyword found anywhere in the value gives the label
    (e.g. ``'application/pdf'`` -> ``'PDF'``). Matching is case-insensitive.

    Args:
        content_type: the value of a Content-Type header, or None / '' when
            the header is not available.

    Returns:
        The format label, or `content_type` itself, unchanged, when nothing
        matches or when it is empty / None.
    """
    # Nothing to translate (e.g. the header was absent): hand the value back untouched.
    if not content_type:
        return content_type

    # Media types are case-insensitive and may carry parameters
    # (e.g. "text/html; charset=UTF-8"), so we normalize before comparing.
    lowered = content_type.lower()
    media_type = lowered.split(';', 1)[0].strip()

    # Exact matches take precedence over the keyword search below.
    exact_matches = {
        'application/vnd.oasis.opendocument.spreadsheet': 'ODS',
        'application/vnd.ms-excel': 'XLS',
    }
    if media_type in exact_matches:
        return exact_matches[media_type]

    # TODO: complete the following list!
    known_types = ('pdf', 'html', 'zip', 'xml', 'javascript', 'json', 'csv', 'tiff')
    for known_type in known_types:
        if known_type in lowered:
            return known_type.upper()

    return content_type
def protocol_to_formats_and_services( links ):
    """Replace the 'protocol' attribute of each link by 'formats' and 'service'.

    Known service protocols (OGC:WMS, OGC:WFS, OGC:WCS, KML, WS, SOS) get both
    a list of formats and a service name. 'HTML' and 'WWW:*' protocols get
    neither, so that such resources are not deemed downloadable. Any other
    protocol is taken to be the (single) format itself. In every case the
    'protocol' attribute is removed from the returned link.

    Args:
        links: a list of dicts describing links; the 'protocol' key is read.

    Returns:
        A new list of new dicts; neither `links` nor the dicts it contains
        are modified.
    """
    # protocol -> (formats, service)
    services_by_protocol = {
        'OGC:WMS': (['PNG', 'JPEG'], 'WMS'),
        'OGC:WFS': (['GML', 'GeoJSON', 'ShapeFile'], 'WFS'),
        'OGC:WCS': (['TIFF'], 'WCS'),
        'KML': (['KML'], 'KML'),
        'WS': (['JSON', 'ShapeFile'], 'WS'),
        'SOS': (['JSON', 'XML'], 'SOS'),
    }

    output = []

    for link in links:
        # Work on a copy: a shallow copy of the list alone would share the
        # dicts with the caller, whose 'protocol' would then be deleted.
        converted = dict(link)
        protocol = converted.pop('protocol', None)

        if protocol in services_by_protocol:
            formats, service = services_by_protocol[protocol]
            # each link gets its own list, so that results can be edited independently
            converted['formats'] = list(formats)
            converted['service'] = service
        elif protocol is None:
            # no protocol declared: there is nothing to derive formats from
            pass
        elif protocol == 'HTML' or (isinstance(protocol, str) and protocol.startswith("WWW:")):
            # in order to prevent HTML resources from being deemed downloadable
            pass
        else:
            converted['formats'] = [ protocol ]

        output.append(converted)

    return output
def _find_basic_auth( url, credentials ):
    """Return the (username, password) pair registered for `url`, or None if there is none."""
    for domain in credentials:
        credential = credentials[domain]
        if domain in url and credential['username'] is not None and credential['password']:
            logging.info('Found a valid credential for %s' % url)
            return credential['username'], credential['password']
    return None


def _head( url, basic_auth=None ):
    """Send a HEAD request to `url` and return the headers of the response."""
    # N.B. (from the original author):
    # when used with Basic Auth, the requests library strips the Content-Length header from the response;
    # that's why we use urllib.request here...
    request = urllib.request.Request( url, method="HEAD", headers={'User-Agent': 'Mozilla/5.0'} )
    if basic_auth is not None:
        token = base64.b64encode(("%s:%s" % basic_auth).encode('ascii'))
        request.add_header("Authorization", "Basic %s" % token.decode('ascii'))
    with urllib.request.urlopen(request) as response:
        return response.headers


def _guess_protocol_from_url( url ):
    """Return the upper-cased known format `url` ends with, or None if it ends with none of them."""
    known_formats = ('ecw', 'pdf', 'zip', 'json', 'tif', 'tiff', 'csv')
    lowered_url = url.lower()
    for known_format in known_formats:
        if lowered_url.endswith(known_format):
            return known_format.upper()
    return None


def _kml_url_from_ws_url( ws_url ):
    """Return `ws_url` with its 'ws' path segment(s) replaced by 'kml'."""
    # Only whole path segments are replaced: a plain str.replace('ws', 'kml')
    # would also rewrite a "ws" found elsewhere, e.g. in the host name.
    return re.sub(r'/ws(?=/|$)', '/kml', ws_url)


def fix_links( links, credentials=None ):
    """Clean up the links of a metadata record, then derive their formats and services.

    For each link:
      * the 'unknown' attribute is removed;
      * the redundant "(OGC:WMS)", "(OGC:WCS)", "(OGC:WFS)", "(OGC:SOS)" tags are stripped from the description;
      * the 'protocol' attribute is fixed: KML and SOS links are recognized, while protocols
        containing 'WWW', 'kml', 'html' or 'null' are replaced by what a HEAD request on the URL
        reports or, if the request fails, by a guess based on the end of the URL;
      * TIF is renamed to TIFF, JSON links become WS links pointing at the service root,
        KML links are pointed at the KML twin of the WS service (or dropped when there is
        no WS link) and SOS URLs lose their query string.

    N.B.: this function performs network requests (one HEAD request per link to be investigated).

    Args:
        links: a list of dicts describing links; a link carrying a 'protocol' must also carry a 'url'.
        credentials: an optional dict, domain -> {'username': ..., 'password': ...}, used for
            HTTP Basic Auth on the URLs containing the domain. None means "no credentials".

    Returns:
        The output of protocol_to_formats_and_services() applied to the fixed links.
        Neither `links` nor the dicts it contains are modified.
    """
    # None means "no credentials": without this, iterating over None would raise
    # within the try block below and the HEAD request would never be sent.
    credentials = credentials or {}

    # Each link is copied, so that the caller's dicts are left untouched.
    fixed_links = [dict(link) for link in links]

    # the 'protocol' attribute is used, today in a rather meaningless way; let's try improving it... (WWW-LINK -> PDF, ...)
    for fixed in fixed_links:

        fixed.pop('unknown', None)

        # the 'description' attribute ends, sometimes, with (OGC:WMS), (OGC:WCS), ..., which is redundant and, in some cases, erroneous.
        if fixed.get('description'):
            fixed['description'] = re.sub(r'\(OGC:WMS\)|\(OGC:WCS\)|\(OGC:WFS\)|\(OGC:SOS\)', '', fixed['description']).strip()

        if 'protocol' not in fixed:
            continue

        protocol = fixed['protocol']
        url = fixed['url']

        # KML
        if protocol == "application/vnd.google-earth.kml+xml":
            fixed['protocol'] = 'KML'
            continue

        # SOS
        if '/sos/' in url.lower():
            fixed['protocol'] = 'SOS'
            continue

        # FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
        # The KML protocol needs also to be fixed.
        if isinstance(protocol, str) and any(marker in protocol for marker in ('WWW', 'kml', 'html', 'null')):
            try:
                # let's try getting the information from the Web Server...
                headers = _head(url, _find_basic_auth(url, credentials))
                content_type = headers['Content-Type']
                if content_type is None:
                    raise ValueError('no Content-Type header in the response')

                # Web servers do not send back the Content-Length of HTML pages.
                # NOTE(review): a missing Content-Length header is stored as None, it does not raise.
                if 'text/html' not in content_type:
                    fixed['content-length'] = headers['Content-Length']

                # N.B.: we use the attribute 'protocol' in the output dict 'cause that's an attribute that GeoNetwork lets the user fill,
                # as opposed to the 'content-type' attribute which is not shown by GeoNetwork's GUI, despite being exposed by the 'q' API.
                # Ideally, in the future the 'protocol' attribute will be filled the right way by the back-office team.
                fixed['protocol'] = translate_content_type(content_type)

            except Exception as e:
                # Deliberately broad: whatever goes wrong with the request, this is a best-effort lookup.
                logging.debug('Did not manage to HEAD %s (%s)' % (url, e))
                # ...otherwise, we make a guess on the basis of the information carried by the URL
                guessed_protocol = _guess_protocol_from_url(url)
                if guessed_protocol is not None:
                    fixed['protocol'] = guessed_protocol

    # FIX TIF -> TIFF and WS / JSON
    # This pass has to be over before the KML links are fixed, since the latter
    # rely on the URL of the WS service, wherever it sits in the list.
    the_fixed_ws_url = None
    for fixed in fixed_links:
        protocol = fixed.get('protocol')
        if protocol == 'TIF':
            fixed['protocol'] = 'TIFF'
        elif protocol == 'JSON':
            fixed['protocol'] = 'WS'
            if 'name' in fixed:
                fixed['name'] = fixed['name'].split('/')[0]
            # keep "scheme://host/ws/namespace" only
            the_fixed_ws_url = '/'.join(fixed['url'].split('/')[0:5])
            fixed['url'] = the_fixed_ws_url

    # FIX KML and SOS
    # A new list is built, instead of deleting items from the list being iterated over,
    # which would cause the link following each deleted one to be skipped.
    kept_links = []
    for fixed in fixed_links:
        protocol = fixed.get('protocol')
        if protocol == 'KML':
            if the_fixed_ws_url is None:
                # no WS, ergo no KML service: the link is dropped
                continue
            fixed['url'] = _kml_url_from_ws_url(the_fixed_ws_url)
        elif protocol == 'SOS':
            # the query string, if any, is dropped
            fixed['url'] = fixed['url'].split('?')[0]
        kept_links.append(fixed)

    return protocol_to_formats_and_services(kept_links)
if __name__ == "__main__":

    # Manual check: a sample of links, as found in actual metadata records,
    # is fixed and the outcome is printed.
    links = [
        # WMS service
        {
            "url": "https://download.data.grandlyon.com/wms/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.ogc.wms_xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "OGC:WMS",
        },
        # KML link with a relative URL
        {
            "url": "../../srv/en/google.kml?uuid=fe13ef2c-d516-4335-84ce-8f910b9388f8&layers=cad_cadastre.cadbornelimitepropriete",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.google-earth.kml+xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "KML",
        },
        # WFS service
        {
            "url": "https://download.data.grandlyon.com/wfs/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "OGC:WFS",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon) (OGC:WFS)",
            "unknown": "1",
            "protocol": "OGC:WFS",
        },
        # plain web page
        {
            "url": "http://www.cadastre.gouv.fr/scpc/accueil.do",
            "name": "cadastre.gouv.fr",
            "content-type": "text/html",
            "description": "Service de consultation du plan cadastral - Ministère du budget, des comptes publics, de la fonction publique et de la réforme de l'Etat",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        # JSON web service
        {
            "url": "https://download.data.grandlyon.com/ws/grandlyon/cad_cadastre.cadbornelimitepropriete/all.json",
            "name": "cad_cadastre.cadbornelimitepropriete/all.json",
            "content-type": "text/html",
            "description": "Description des données dans le format texte JSON",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        # PDF document, declared as WWW-LINK
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "WWW-LINK",
        },
        # same PDF document, with a "null" protocol
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "null",
        },
        # SOS service, with a query string
        {
            "name": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "description": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "url": "https://download.data.grandlyon.com/sos/velov?request=getCapabilities&service=SOS",
            "protocol": "WWW:LINK-1.0-http--link",
            "content-type": "text/html",
            "unknown": "1",
        },
        # HTML page, with a MIME type as protocol
        {
            "name": "Démonstrateur WMS-Time",
            "description": "Visualisation cartographique temporelle (WMS-T) de l'historique des disponibilités des stations Vélo'V",
            "url": "http://demo.data.grandlyon.com/wmst/reseau_velov.html",
            "protocol": "text/html",
            "content-type": "text/html",
            "unknown": "1",
        },
        # web page, declared as WWW-LINK
        {
            "name": "alerte-pollens",
            "description": "alerte-pollens",
            "url": "http://www.pollens.fr/alerte-pollens",
            "protocol": "WWW-LINK",
            "content-type": "WWW:LINK",
            "unknown": "0",
        },
        # WCS service
        {
            "name": "MNT2009_ombrage_10m_CC46",
            "description": "Ombrage du relief du Grand Lyon 2009",
            "url": "https://download.data.grandlyon.com/wcs/grandlyon",
            "protocol": "OGC:WCS",
            "content-type": "OGC:WCS",
            "unknown": "1",
        },
        # PDF document, hosted on an external website
        {
            "name": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
            "description": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
            "url": "http://www.prefectures-regions.gouv.fr/auvergne-rhone-alpes/content/download/35211/238621/file/5-7-2017_recueil-84-2017-096-recueil-des-actes-administratifs-special.pdf",
            "protocol": "WWW:LINK-1.0-http--link",
            "content-type": "text/html",
            "unknown": "1",
        },
    ]

    pprint(fix_links(links))