Newer
Older
import re
from pprint import pprint
import json
import requests
import urllib.request
import base64
Alessandro Cerioni
committed
output = content_type
Alessandro Cerioni
committed
types = ['pdf', 'html', 'zip', 'xml', 'javascript', 'json', 'csv', 'tiff']
Alessandro Cerioni
committed
for the_type in types:
if the_type in content_type:
output = the_type.upper()
break
Alessandro Cerioni
committed
if content_type == 'application/vnd.oasis.opendocument.spreadsheet':
output = 'ODS'
if content_type == 'application/vnd.ms-excel':
output = 'XLS'
Alessandro Cerioni
committed
return output
def protocol_to_formats_and_services(links):
    """Replace each link's 'protocol' with 'formats' and/or 'service' keys.

    The input list and its dicts are left untouched: every link is copied
    before being modified, so callers can keep using the original records.

    Args:
        links: iterable of dicts, each carrying a 'protocol' string.

    Returns:
        A new list of new dicts, in the same order, where 'protocol' has been
        removed and, depending on its value, 'formats' (list of str) and/or
        'service' (str) have been added.

    Raises:
        KeyError: if a link has no 'protocol' key.
    """
    # protocol -> (formats or None, service); 'WS' exposes a service only.
    known_services = {
        'OGC:WMS': (['PNG', 'JPEG'], 'WMS'),
        'OGC:WFS': (['GML', 'GeoJSON', 'ShapeFile'], 'WFS'),
        'OGC:WCS': (['TIFF'], 'WCS'),
        'KML': (['KML'], 'KML'),
        'WS': (None, 'WS'),
        'SOS': (['JSON', 'XML'], 'SOS'),
    }

    output = []
    for link in links:
        # Copy the record so that the caller's dict is not mutated.
        entry = dict(link)
        # NOTE(review): 'protocol' is dropped for every link, whatever branch
        # is taken below -- confirm this matches the intended layout.
        protocol = entry.pop('protocol')

        if protocol in known_services:
            formats, service = known_services[protocol]
            if formats is not None:
                # Fresh list per link, so entries never share a formats list.
                entry['formats'] = list(formats)
            entry['service'] = service
        elif protocol == 'HTML' or protocol.startswith(('WWW:', 'ESRI:')):
            # No 'formats' on purpose: this prevents HTML (and WWW:/ESRI:)
            # resources from being deemed downloadable.
            pass
        else:
            # Any other protocol is taken to be the format itself (e.g. 'PDF').
            entry['formats'] = [protocol]

        output.append(entry)

    return output
Alessandro Cerioni
committed
# fix_links: normalises the 'protocol', 'description', 'url' and 'name' attributes of a list of link dicts,
# then returns the result of protocol_to_formats_and_services() applied to the fixed list.
#   links       -- list of dicts; the code reads the keys 'url', 'protocol', 'description' and 'unknown'.
#   credentials -- mapping domain -> {'username': ..., 'password': ...}, used for HTTP Basic Auth on HEAD requests.
# NOTE(review): the bare "Alessandro Cerioni" / "committed" lines below are not Python; they look like residue
# from a blame/history view and the indentation of the whole block has been lost -- restore from the repository.
# NOTE(review): `logging` is used below but no `import logging` is visible among the imports -- confirm.
def fix_links( links, credentials=None ):
# NOTE: list.copy() is shallow, so fixed_links[k] and links[k] are the same dict objects;
# every assignment/deletion on fixed_links[k] below is therefore also visible to the caller through `links`.
fixed_links = links.copy()
# the 'protocol' attribute is used, today in a rather meaningless way; let's try improving it... (WWW-LINK -> PDF, ...)
for k, link in enumerate(links):
if 'unknown' in link.keys():
del fixed_links[k]['unknown']
# the 'description' attribute sometimes ends with (OGC:WMS), (OGC:WCS), ..., which is redundant and, in some cases, erroneous.
if 'description' in link.keys():
Alessandro Cerioni
committed
fixed_links[k]['description'] = re.sub(r'\(OGC:WMS\)|\(OGC:WCS\)|\(OGC:WFS\)|\(OGC:SOS\)', '', link['description']).strip()
# KML: the MIME type is replaced by the short 'KML' label
if 'protocol' in link.keys() and link['protocol'] == "application/vnd.google-earth.kml+xml":
fixed_links[k]['protocol'] = 'KML'
continue
# SOS: detected from the URL path rather than from the declared protocol
if 'protocol' in link.keys() and '/sos/' in link['url'].lower():
fixed_links[k]['protocol'] = 'SOS'
continue
# FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
# The KML protocol needs also to be fixed.
if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
Alessandro Cerioni
committed
#print()
Alessandro Cerioni
committed
#print(link['url'])
# NOTE(review): the `except` further down has no visible `try:`; one presumably opened around here -- confirm.
# let's try getting the information from the Web Server...
with_credentials = False
# NOTE(review): credentials defaults to None, and iterating None raises TypeError; whether that is caught
# depends on where the missing `try:` sits -- confirm.
for domain in credentials:
if domain in link['url'] and credentials[domain]['username'] != None and credentials[domain]['password']:
logging.info('Found a valid credential for %s' % link['url'])
with_credentials = True
username = credentials[domain]['username']
password = credentials[domain]['password']
break
Alessandro Cerioni
committed
# N.B.:
# when used with Basic Auth, the requests library strips the Content-Length header from the response;
# that's why we use urllib.request here...
# Spaces in the URL are percent-encoded and only a HEAD request is sent (no body is downloaded).
req = urllib.request.Request( link['url'].replace(" ", "%20"), method="HEAD", headers={'User-Agent': 'Mozilla/5.0'})
Alessandro Cerioni
committed
if with_credentials:
# Basic Auth header built by hand: base64("username:password")
base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii'))
req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))
Alessandro Cerioni
committed
resp = urllib.request.urlopen(req)
resp.close()
# the presence of the content-length assures that the Web Server knows what it is talking about,
# that is why we include the following line in this try-except block, except for HTML pages, in which case
# webservers do not send back the content-length
if 'text/html' not in resp.headers['Content-Type']:
fixed_links[k]['content-length'] = resp.headers['Content-Length']
# N.B.: we use the attribute 'protocol' in the output dict 'cause that's an attribute that GeoNetwork lets the user fill,
# as opposed to the 'content-type' attribute which is not shown by GeoNetwork's GUI, despite being exposed by the 'q' API.
# Ideally, in the future the 'protocol' attribute will be filled the right way by the back-office team.
fixed_links[k]['protocol'] = translate_content_type(resp.headers['Content-Type'])
except Exception as e:
logging.debug('Did not manage to HEAD %s (%s)' % (link['url'], e))
Alessandro Cerioni
committed
# ...otherwise, we make a guess on the basis of the information carried by the URL
Alessandro Cerioni
committed
known_formats = ['ecw', 'pdf', 'zip', 'json', 'tif', 'tiff', 'csv']
Alessandro Cerioni
committed
# NOTE(review): `known_format` is not bound by any visible line; a `for known_format in known_formats:`
# loop header appears to be missing here -- confirm.
if link['url'].lower().endswith(known_format):
fixed_links[k]['protocol'] = known_format.upper()
continue
# FIX TIF -> TIFF
# NOTE: link['protocol'] is read without a guard here, so a link lacking 'protocol' raises KeyError.
for k, link in enumerate(fixed_links):
if link['protocol'] == 'TIF':
fixed_links[k]['protocol'] = 'TIFF'
Alessandro Cerioni
committed
the_fixed_ws_url = None
# NOTE(review): the lines below use `k`, `link` and `continue`, so a `for k, link in enumerate(...)` header
# appears to be missing after the assignment above -- confirm.
if link['protocol'] == 'JSON' and ('download.data.grandlyon.com/ws' in link['url'] or 'download.recette.data.grandlyon.com/ws' in link['url']):
fixed_links[k]['protocol'] = 'WS'
tmp = links[k]['url']
Alessandro Cerioni
committed
fixed_links[k]['name'] = tmp.split('/')[-2] # second to last element
#tmp = links[k]['url']
# keep only the first five '/'-separated pieces of the URL
the_fixed_ws_url = '/'.join(tmp.split('/')[0:5])
fixed_links[k]['url'] = the_fixed_ws_url
continue
# FIX KML
for k, link in enumerate(fixed_links):
if link['protocol'] == 'KML':
Alessandro Cerioni
committed
if the_fixed_ws_url != None: # it means that the WS exists, ergo the KML service
# NOTE: str.replace substitutes every occurrence of 'ws' in the URL, not only a '/ws/' path segment.
fixed_links[k]['url'] = the_fixed_ws_url.replace('ws', 'kml')
else:
# NOTE(review): this deletes from the list being enumerated; later items shift down by one,
# so the item right after a deleted one is not visited by this loop -- confirm this is acceptable.
del fixed_links[k]
Alessandro Cerioni
committed
# FIX SOS
for k, link in enumerate(fixed_links):
if link['protocol'] == 'SOS':
# NOTE(review): the body of the following `if` is not visible here -- confirm against the repository.
if '?' in link['url']:
Alessandro Cerioni
committed
# the fixed links are finally converted from 'protocol' to 'formats'/'service'
return protocol_to_formats_and_services(fixed_links)
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
if __name__ == "__main__":
    # Manual smoke test: run fix_links() on a handful of sample link records
    # and pretty-print the outcome.
    links = [
        # OGC web services (WMS / KML / WFS)
        {
            "url": "https://download.data.grandlyon.com/wms/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.ogc.wms_xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "OGC:WMS",
        },
        {
            "url": "../../srv/en/google.kml?uuid=fe13ef2c-d516-4335-84ce-8f910b9388f8&layers=cad_cadastre.cadbornelimitepropriete",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.google-earth.kml+xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "KML",
        },
        {
            "url": "https://download.data.grandlyon.com/wfs/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "OGC:WFS",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon) (OGC:WFS)",
            "unknown": "1",
            "protocol": "OGC:WFS",
        },
        # Generic web links carrying a "WWW..." or "null" protocol
        {
            "url": "http://www.cadastre.gouv.fr/scpc/accueil.do",
            "name": "cadastre.gouv.fr",
            "content-type": "text/html",
            "description": "Service de consultation du plan cadastral - Ministère du budget, des comptes publics, de la fonction publique et de la réforme de l'Etat",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        {
            "url": "https://download.data.grandlyon.com/ws/grandlyon/cad_cadastre.cadbornelimitepropriete/all.json",
            "name": "cad_cadastre.cadbornelimitepropriete/all.json",
            "content-type": "text/html",
            "description": "Description des données dans le format texte JSON",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "WWW-LINK",
        },
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "null",
        },
        # SOS endpoint declared as a plain web link
        {
            "name": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "description": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "url": "https://download.data.grandlyon.com/sos/velov?request=getCapabilities&service=SOS",
            "protocol": "WWW:LINK-1.0-http--link",
            "content-type": "text/html",
            "unknown": "1",
        },
        # HTML pages
        {
            "name": "Démonstrateur WMS-Time",
            "description": "Visualisation cartographique temporelle (WMS-T) de l'historique des disponibilités des stations Vélo'V",
            "url": "http://demo.data.grandlyon.com/wmst/reseau_velov.html",
            "protocol": "text/html",
            "content-type": "text/html",
            "unknown": "1",
        },
        {
            "name": "alerte-pollens",
            "description": "alerte-pollens",
            "url": "http://www.pollens.fr/alerte-pollens",
            "protocol": "WWW-LINK",
            "content-type": "WWW:LINK",
            "unknown": "0",
        },
        # WCS coverage
        {
            "name": "MNT2009_ombrage_10m_CC46",
            "description": "Ombrage du relief du Grand Lyon 2009",
            "url": "https://download.data.grandlyon.com/wcs/grandlyon",
            "protocol": "OGC:WCS",
            "content-type": "OGC:WCS",
            "unknown": "1",
        },
        # PDF document hosted on an external site
        {
            "name": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
            "description": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
            "url": "http://www.prefectures-regions.gouv.fr/auvergne-rhone-alpes/content/download/35211/238621/file/5-7-2017_recueil-84-2017-096-recueil-des-actes-administratifs-special.pdf",
            "protocol": "WWW:LINK-1.0-http--link",
            "content-type": "text/html",
            "unknown": "1",
        },
    ]

    fixed_links = fix_links(links)
    pprint(fixed_links)