diff --git a/utils/fix_links.py b/utils/fix_links.py index 423a98b08210a89c1eee8fa8640ecef41ac1420a..0245c9a4fa8843028792807744e6ecd1d29518da 100644 --- a/utils/fix_links.py +++ b/utils/fix_links.py @@ -61,7 +61,7 @@ def protocol_to_formats_and_services( links ): return output -def fix_links( links, credentials ): +def fix_links( links, credentials=None ): fixed_links = links.copy() @@ -84,7 +84,7 @@ def fix_links( links, credentials ): # The KML protocol needs also to be fixed. if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ): print() - print(link['url']) + #print(link['url']) try: # let's try getting the information from the Web Server... with_credentials = False @@ -96,16 +96,17 @@ def fix_links( links, credentials ): password = credentials[domain]['password'] break + # N.B.: + # when used with Basic Auth, the requests library strips the Content-Length header from the response; + # that's why we use urllib.request here... + req = urllib.request.Request( link['url'], method="HEAD", headers={'User-Agent': 'Mozilla/5.0'}) + if with_credentials: - # N.B.: - # when used with Basic Auth, the requests library strips the Content-Length header from the response; - # that's why we use urllib.request here... - req = urllib.request.Request( link['url'], method="HEAD" ) base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii')) req.add_header("Authorization", "Basic %s" % base64string.decode('ascii')) - resp = urllib.request.urlopen(req) - else: - resp = requests.head( link['url'], allow_redirects=True ) + + resp = urllib.request.urlopen(req) + resp.close() # the presence of the content-length assures that the Web Server knows what it is talking about, # that is why we include the following line in this try-except block, except for HTML pages, in which case @@ -123,7 +124,7 @@ def fix_links( links, credentials ): known_formats = ['ecw', 'pdf', 'zip', 'kml', 'json', 'tif', 'tiff', 'csv', 'sos'] for known_format in known_formats: - if known_format in link['url'].lower(): + if link['url'].lower().endswith(known_format): fixed_links[k]['protocol'] = known_format.upper() continue @@ -261,6 +262,14 @@ if __name__ == '__main__': "protocol": "OGC:WCS", "content-type": "OGC:WCS", "unknown": "1" + }, + { + "name": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution", + "description": "Document cadre relatif au dispositif inter-préfectoral en cas de pics de pollution", + "url": "http://www.prefectures-regions.gouv.fr/auvergne-rhone-alpes/content/download/35211/238621/file/5-7-2017_recueil-84-2017-096-recueil-des-actes-administratifs-special.pdf", + "protocol": "WWW:LINK-1.0-http--link", + "content-type": "text/html", + "unknown": "1" } ]