Commit 85715e1f authored by Alessandro CERIONI
Browse files

Prevent HTML resources / simple URLs from being deemed downloadable resources

parent 8b835820
......@@ -61,7 +61,7 @@ def protocol_to_formats_and_services( links ):
return output
def fix_links( links, credentials ):
def fix_links( links, credentials=None ):
fixed_links = links.copy()
......@@ -84,7 +84,7 @@ def fix_links( links, credentials ):
# The KML protocol needs also to be fixed.
if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
print()
print(link['url'])
#print(link['url'])
try:
# let's try getting the information from the Web Server...
with_credentials = False
......@@ -96,16 +96,17 @@ def fix_links( links, credentials ):
password = credentials[domain]['password']
break
# N.B.:
# when used with Basic Auth, the requests library strips the Content-Length header from the response;
# that's why we use urllib.request here...
req = urllib.request.Request( link['url'], method="HEAD", headers={'User-Agent': 'Mozilla/5.0'})
if with_credentials:
# N.B.:
# when used with Basic Auth, the requests library strips the Content-Length header from the response;
# that's why we use urllib.request here...
req = urllib.request.Request( link['url'], method="HEAD" )
base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii'))
req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))
resp = urllib.request.urlopen(req)
else:
resp = requests.head( link['url'], allow_redirects=True )
resp = urllib.request.urlopen(req)
resp.close()
# the presence of the content-length assures that the Web Server knows what it is talking about,
# that is why we include the following line in this try-except block, except for HTML pages, in which case
......@@ -123,7 +124,7 @@ def fix_links( links, credentials ):
known_formats = ['ecw', 'pdf', 'zip', 'kml', 'json', 'tif', 'tiff', 'csv', 'sos']
for known_format in known_formats:
if known_format in link['url'].lower():
if link['url'].lower().endswith(known_format):
fixed_links[k]['protocol'] = known_format.upper()
continue
......@@ -261,6 +262,14 @@ if __name__ == '__main__':
"protocol": "OGC:WCS",
"content-type": "OGC:WCS",
"unknown": "1"
},
{
"name": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
"description": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
"url": "http://www.prefectures-regions.gouv.fr/auvergne-rhone-alpes/content/download/35211/238621/file/5-7-2017_recueil-84-2017-096-recueil-des-actes-administratifs-special.pdf",
"protocol": "WWW:LINK-1.0-http--link",
"content-type": "text/html",
"unknown": "1"
}
]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment