Skip to content
Snippets Groups Projects
Commit 85715e1f authored by Alessandro Cerioni's avatar Alessandro Cerioni
Browse files

Prevent HTML resources / simple URLs from being deemed downloadable resources

parent 8b835820
Branches
Tags
No related merge requests found
...@@ -61,7 +61,7 @@ def protocol_to_formats_and_services( links ): ...@@ -61,7 +61,7 @@ def protocol_to_formats_and_services( links ):
return output return output
def fix_links( links, credentials ): def fix_links( links, credentials=None ):
fixed_links = links.copy() fixed_links = links.copy()
...@@ -84,7 +84,7 @@ def fix_links( links, credentials ): ...@@ -84,7 +84,7 @@ def fix_links( links, credentials ):
# The KML protocol needs also to be fixed. # The KML protocol needs also to be fixed.
if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ): if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
print() print()
print(link['url']) #print(link['url'])
try: try:
# let's try getting the information from the Web Server... # let's try getting the information from the Web Server...
with_credentials = False with_credentials = False
...@@ -96,16 +96,17 @@ def fix_links( links, credentials ): ...@@ -96,16 +96,17 @@ def fix_links( links, credentials ):
password = credentials[domain]['password'] password = credentials[domain]['password']
break break
# N.B.:
# when used with Basic Auth, the requests library strips the Content-Length header from the response;
# that's why we use urllib.request here...
req = urllib.request.Request( link['url'], method="HEAD", headers={'User-Agent': 'Mozilla/5.0'})
if with_credentials: if with_credentials:
# N.B.:
# when used with Basic Auth, the requests library strips the Content-Length header from the response;
# that's why we use urllib.request here...
req = urllib.request.Request( link['url'], method="HEAD" )
base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii')) base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii'))
req.add_header("Authorization", "Basic %s" % base64string.decode('ascii')) req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))
resp = urllib.request.urlopen(req)
else: resp = urllib.request.urlopen(req)
resp = requests.head( link['url'], allow_redirects=True ) resp.close()
# the presence of the content-length assures that the Web Server knows what it is talking about, # the presence of the content-length assures that the Web Server knows what it is talking about,
# that is why we include the following line in this try-except block, except for HTML pages, in which case # that is why we include the following line in this try-except block, except for HTML pages, in which case
...@@ -123,7 +124,7 @@ def fix_links( links, credentials ): ...@@ -123,7 +124,7 @@ def fix_links( links, credentials ):
known_formats = ['ecw', 'pdf', 'zip', 'kml', 'json', 'tif', 'tiff', 'csv', 'sos'] known_formats = ['ecw', 'pdf', 'zip', 'kml', 'json', 'tif', 'tiff', 'csv', 'sos']
for known_format in known_formats: for known_format in known_formats:
if known_format in link['url'].lower(): if link['url'].lower().endswith(known_format):
fixed_links[k]['protocol'] = known_format.upper() fixed_links[k]['protocol'] = known_format.upper()
continue continue
...@@ -261,6 +262,14 @@ if __name__ == '__main__': ...@@ -261,6 +262,14 @@ if __name__ == '__main__':
"protocol": "OGC:WCS", "protocol": "OGC:WCS",
"content-type": "OGC:WCS", "content-type": "OGC:WCS",
"unknown": "1" "unknown": "1"
},
{
"name": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
"description": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution",
"url": "http://www.prefectures-regions.gouv.fr/auvergne-rhone-alpes/content/download/35211/238621/file/5-7-2017_recueil-84-2017-096-recueil-des-actes-administratifs-special.pdf",
"protocol": "WWW:LINK-1.0-http--link",
"content-type": "text/html",
"unknown": "1"
} }
] ]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment