Newer
Older
import re
from pprint import pprint
import json
import requests
from .my_logging import logging
Alessandro Cerioni
committed
output = content_type
# TODO: complete the following list!
types = ['pdf', 'html', 'zip', 'xml', 'javascript', 'json', 'csv']
Alessandro Cerioni
committed
for the_type in types:
if the_type in content_type:
output = the_type.upper()
break
Alessandro Cerioni
committed
if content_type == 'application/vnd.oasis.opendocument.spreadsheet':
output = 'ODS'
if content_type == 'application/vnd.ms-excel':
output = 'XLS'
Alessandro Cerioni
committed
return output
# Known link protocols mapped to (service name, formats offered by that service).
_PROTOCOL_SERVICES = {
    'OGC:WMS': ('WMS', ('PNG', 'JPEG')),
    'OGC:WFS': ('WFS', ('GML', 'GeoJSON', 'ShapeFile')),
    'OGC:WCS': ('WCS', ('TIFF',)),
    'KML': ('KML', ('KML',)),
    'WS': ('WS', ('JSON', 'ShapeFile')),
    'SOS': ('SOS', ('JSON', 'XML')),
}


def protocol_to_formats_and_services( links ):
    """Replace each link's 'protocol' with 'formats' (and, when known, 'service').

    Args:
        links: iterable of dicts, each carrying a 'protocol' key.

    Returns:
        A new list of new dicts. Every dict keeps the keys of the matching
        input link except 'protocol', gains a 'formats' list and, for the
        protocols listed in ``_PROTOCOL_SERVICES``, a 'service' string. For
        any other protocol, 'formats' is a one-element list holding the
        protocol value itself and no 'service' key is added.

    Raises:
        KeyError: if a link has no 'protocol' key.
    """
    output = []

    for link in links:
        # Work on a per-link copy: a plain list copy would still share the
        # dicts with the caller and the edits below would leak into `links`.
        converted = dict(link)
        protocol = converted.pop('protocol')

        known = _PROTOCOL_SERVICES.get(protocol)
        if known is None:
            converted['formats'] = [protocol]
        else:
            service, formats = known
            # Fresh list per link so that callers can mutate it safely.
            converted['formats'] = list(formats)
            converted['service'] = service

        output.append(converted)

    return output
def fix_links( links ):
# Clean up a sequence of link dicts and return the result of passing them to
# protocol_to_formats_and_services().
#
# Steps visible in this function:
#   1. drop the 'unknown' key and remove "(OGC:WMS)"-style tags from 'description';
#   2. for links whose 'protocol' looks unreliable, re-derive it from an HTTP HEAD
#      request, or from substrings of the URL;
#   3. normalise TIF -> TIFF, JSON -> WS, then adjust KML and SOS links.
#
# NOTE(review): this block carries no indentation and contains stray
# "Alessandro Cerioni" / "committed" lines that are not Python (presumably
# version-control blame residue). Nesting mentioned in the comments below is
# therefore an assumption - confirm against the real file.
#
# NOTE(review): if `links` is a list, .copy() is shallow: fixed_links[k] and links[k]
# are the same dict object, so every per-link edit below is also visible through `links`.
fixed_links = links.copy()
# the 'protocol' attribute is used, today in a rather meaningless way; let's try improving it... (WWW-LINK -> PDF, ...)
for k, link in enumerate(links):
if 'unknown' in link.keys():
del fixed_links[k]['unknown']
# the 'description' attribute sometimes ends with (OGC:WMS), (OGC:WCS), ..., which is redundant and, in some cases, erroneous.
# The substitution removes every occurrence of these four tags, then strips surrounding whitespace.
if 'description' in link.keys():
Alessandro Cerioni
committed
fixed_links[k]['description'] = re.sub(r'\(OGC:WMS\)|\(OGC:WCS\)|\(OGC:WFS\)|\(OGC:SOS\)', '', link['description']).strip()
# # FIX links pointing to external websites
# #logging.debug(link)
# if (link['protocol'] == 'WWW-LINK' and link['content-type'] == 'WWW:LINK'):
# fixed_links[k]['protocol'] = 'HTML'
# FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
# The KML protocol needs also to be fixed.
# The test is a case-sensitive substring match of 'WWW', 'kml', 'html' or 'null' against link['protocol'].
if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
# let's try getting the information from the Web Server...
# NOTE(review): no timeout is passed to requests.head - TODO confirm whether an unresponsive server can block here.
resp = requests.head( link['url'], allow_redirects=True )
# the presence of the content-length assures that the Web Server knows what it is talking about,
# that is why we include the following line in this try-except block, except for HTML pages, in which case
# webservers do not send back the content-length
if 'text/html' not in resp.headers['Content-Type']:
fixed_links[k]['content-length'] = resp.headers['Content-Length']
# N.B.: we use the attribute 'protocol' in the output dict 'cause that's an attribute that GeoNetwork lets the user fill,
# as opposed to the 'content-type' attribute which is not shown by GeoNetwork's GUI, despite being exposed by the 'q' API.
# Ideally, in the future the 'protocol' attribute will be filled the right way by the back-office team.
fixed_links[k]['protocol'] = translate_content_type(resp.headers['Content-Type'])
# NOTE(review): the `try:` that this `except` closes is not visible in this block.
except Exception as e:
logging.debug(e)
# ...otherwise, we make a guess on the basis of the information carried by the URL
Alessandro Cerioni
committed
known_formats = ['ecw', 'pdf', 'zip', 'kml', 'json', 'tif', 'tiff', 'csv', 'sos']
# NOTE(review): `continue` does not stop the scan of known_formats, so when several
# entries occur in the URL the last matching one is kept - `break` may have been intended; confirm.
for known_format in known_formats:
if known_format in link['url'].lower():
fixed_links[k]['protocol'] = known_format.upper()
continue
# FIX TIF -> TIFF
# NOTE(review): from here on link['protocol'] is read without the "'protocol' in link" guard
# used above, so a link lacking that key raises KeyError.
for k, link in enumerate(fixed_links):
if link['protocol'] == 'TIF':
fixed_links[k]['protocol'] = 'TIFF'
Alessandro Cerioni
committed
# JSON links are relabelled 'WS': the name keeps only the text before its first '/',
# and the URL keeps only its first five '/'-separated pieces.
# the_fixed_ws_url remembers the last such URL for the KML step below.
the_fixed_ws_url = None
for k, link in enumerate(fixed_links):
if link['protocol'] == 'JSON':
fixed_links[k]['protocol'] = 'WS'
tmp = fixed_links[k]['name']
fixed_links[k]['name'] = tmp.split('/')[0]
# the URL is read through `links`, not `fixed_links`
tmp = links[k]['url']
the_fixed_ws_url = '/'.join(tmp.split('/')[0:5])
fixed_links[k]['url'] = the_fixed_ws_url
continue
# FIX KML
for k, link in enumerate(fixed_links):
if link['protocol'] == 'KML':
Alessandro Cerioni
committed
if the_fixed_ws_url != None: # it means that the WS exists, ergo the KML service
# NOTE(review): str.replace swaps every 'ws' occurrence in the URL, not only a '/ws/' path segment.
fixed_links[k]['url'] = the_fixed_ws_url.replace('ws', 'kml')
else:
# NOTE(review): presumably this drops a KML link when no WS link was found - confirm the nesting.
# Deleting from fixed_links while enumerating it makes the loop skip the element that follows.
del fixed_links[k]
Alessandro Cerioni
committed
# FIX SOS
for k, link in enumerate(fixed_links):
if link['protocol'] == 'SOS':
# NOTE(review): the body of the following `if` is not visible in this block.
if '?' in link['url']:
Alessandro Cerioni
committed
return protocol_to_formats_and_services(fixed_links)
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
if __name__ == "__main__":
    # Manual smoke test: run fix_links() on a hand-written set of sample link
    # records and pretty-print the result. fix_links() may issue HTTP HEAD
    # requests for some of these URLs.
    sample_links = [
        # WMS layer
        {
            "url": "https://download.data.grandlyon.com/wms/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.ogc.wms_xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "OGC:WMS",
        },
        # KML export with a relative URL
        {
            "url": "../../srv/en/google.kml?uuid=fe13ef2c-d516-4335-84ce-8f910b9388f8&layers=cad_cadastre.cadbornelimitepropriete",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "application/vnd.google-earth.kml+xml",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon)(OGC:WMS)",
            "unknown": "1",
            "protocol": "KML",
        },
        # WFS layer
        {
            "url": "https://download.data.grandlyon.com/wfs/grandlyon",
            "name": "cad_cadastre.cadbornelimitepropriete",
            "content-type": "OGC:WFS",
            "description": "Borne limite de propriété (Plan cadastral informatisé du Grand Lyon) (OGC:WFS)",
            "unknown": "1",
            "protocol": "OGC:WFS",
        },
        # external web page
        {
            "url": "http://www.cadastre.gouv.fr/scpc/accueil.do",
            "name": "cadastre.gouv.fr",
            "content-type": "text/html",
            "description": "Service de consultation du plan cadastral - Ministère du budget, des comptes publics, de la fonction publique et de la réforme de l'Etat",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        # JSON web-service endpoint
        {
            "url": "https://download.data.grandlyon.com/ws/grandlyon/cad_cadastre.cadbornelimitepropriete/all.json",
            "name": "cad_cadastre.cadbornelimitepropriete/all.json",
            "content-type": "text/html",
            "description": "Description des données dans le format texte JSON",
            "unknown": "1",
            "protocol": "WWW:LINK-1.0-http--link",
        },
        # PDF declared as WWW-LINK
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "WWW-LINK",
        },
        # same PDF with a "null" protocol
        {
            "url": "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
            "name": "Licence ouverte",
            "content-type": "text/html",
            "description": "Licence ouverte",
            "unknown": "1",
            "protocol": "null",
        },
        # SOS endpoint with a query string
        {
            "name": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "description": "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
            "url": "https://download.data.grandlyon.com/sos/velov?request=getCapabilities&service=SOS",
            "protocol": "WWW:LINK-1.0-http--link",
            "content-type": "text/html",
            "unknown": "1",
        },
        # HTML demo page whose protocol is a MIME type
        {
            "name": "Démonstrateur WMS-Time",
            "description": "Visualisation cartographique temporelle (WMS-T) de l'historique des disponibilités des stations Vélo'V",
            "url": "http://demo.data.grandlyon.com/wmst/reseau_velov.html",
            "protocol": "text/html",
            "content-type": "text/html",
            "unknown": "1",
        },
        # external site declared as WWW-LINK / WWW:LINK
        {
            "name": "alerte-pollens",
            "description": "alerte-pollens",
            "url": "http://www.pollens.fr/alerte-pollens",
            "protocol": "WWW-LINK",
            "content-type": "WWW:LINK",
            "unknown": "0",
        },
        # WCS coverage
        {
            "name": "MNT2009_ombrage_10m_CC46",
            "description": "Ombrage du relief du Grand Lyon 2009",
            "url": "https://download.data.grandlyon.com/wcs/grandlyon",
            "protocol": "OGC:WCS",
            "content-type": "OGC:WCS",
            "unknown": "1",
        },
    ]

    pprint(fix_links(sample_links))