Skip to content
Snippets Groups Projects
Commit eb0262f7 authored by Alessandro Cerioni
Browse files

Improved 'protocol' detection for HTML resources.

parent 12783d10
Branches
Tags
No related merge requests found
...@@ -4,7 +4,6 @@ import json ...@@ -4,7 +4,6 @@ import json
import requests import requests
from .my_logging import logging from .my_logging import logging
def translate_content_type( content_type ): def translate_content_type( content_type ):
# TODO: complete the following list! # TODO: complete the following list!
...@@ -39,13 +38,15 @@ def fix_links( links ): ...@@ -39,13 +38,15 @@ def fix_links( links ):
# FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link" # FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
# The KML protocol needs also to be fixed. # The KML protocol needs also to be fixed.
if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html'] ] ): if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
try: try:
# let's try getting the information from the Web Server... # let's try getting the information from the Web Server...
resp = requests.head( link['url'], allow_redirects=True ) resp = requests.head( link['url'], allow_redirects=True )
# the presence of the content-length assures that the Web Server knows what it is talking about, # the presence of the content-length assures that the Web Server knows what it is talking about,
# that is why we include the following line in this try-except block # that is why we include the following line in this try-except block, except for HTML pages, in which case
fixed_links[k]['content-length'] = resp.headers['Content-Length'] # webservers do not send back the content-length
if 'text/html' not in resp.headers['Content-Type']:
fixed_links[k]['content-length'] = resp.headers['Content-Length']
# N.B.: we use the attribute 'protocol' in the output dict 'cause that's an attribute that GeoNetwork lets the user fill, # N.B.: we use the attribute 'protocol' in the output dict 'cause that's an attribute that GeoNetwork lets the user fill,
# as opposed to the 'content-type' attribute which is not shown by GeoNetwork's GUI, despite being exposed by the 'q' API. # as opposed to the 'content-type' attribute which is not shown by GeoNetwork's GUI, despite being exposed by the 'q' API.
# Ideally, in the future the 'protocol' attribute will be filled the right way by the back-office team. # Ideally, in the future the 'protocol' attribute will be filled the right way by the back-office team.
...@@ -63,6 +64,13 @@ def fix_links( links ): ...@@ -63,6 +64,13 @@ def fix_links( links ):
#pprint(fixed_links) #pprint(fixed_links)
# FIX TIF -> TIFF
for k, link in enumerate(fixed_links):
if link['protocol'] == 'TIF':
fixed_links[k]['protocol'] = 'TIFF'
# FIX WS / JSON # FIX WS / JSON
the_fixed_ws_url = None the_fixed_ws_url = None
for k, link in enumerate(fixed_links): for k, link in enumerate(fixed_links):
...@@ -151,6 +159,14 @@ if __name__ == '__main__': ...@@ -151,6 +159,14 @@ if __name__ == '__main__':
'unknown': "1", 'unknown': "1",
'protocol': "WWW-LINK" 'protocol': "WWW-LINK"
}, },
{
'url': "https://download.data.grandlyon.com/files/grandlyon/LicenceOuverte.pdf",
'name': "Licence ouverte",
'content-type': "text/html",
'description': "Licence ouverte",
'unknown': "1",
'protocol': "null"
},
{ {
'name': "Historique des disponibilités des stations Vélo'V(OGC:SOS)", 'name': "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
'description': "Historique des disponibilités des stations Vélo'V(OGC:SOS)", 'description': "Historique des disponibilités des stations Vélo'V(OGC:SOS)",
...@@ -166,6 +182,14 @@ if __name__ == '__main__': ...@@ -166,6 +182,14 @@ if __name__ == '__main__':
'protocol': "text/html", 'protocol': "text/html",
'content-type': "text/html", 'content-type': "text/html",
'unknown': "1" 'unknown': "1"
},
{
"name": "alerte-pollens",
"description": "alerte-pollens",
"url": "http://www.pollens.fr/alerte-pollens",
"protocol": "WWW-LINK",
"content-type": "WWW:LINK",
"unknown": "0"
} }
] ]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment