diff --git a/1-metadata-getter.py b/1-metadata-getter.py
index 3dff117bfbdaa17900ecd89040e63fbc138ba0c7..650e361c8ceed2bfd0fd5bc687b7ae0e9b095233 100644
--- a/1-metadata-getter.py
+++ b/1-metadata-getter.py
@@ -14,7 +14,7 @@ def filter_function( x, the_uuids_to_filter_out ):
     return x['geonet:info']['uuid'] not in the_uuids_to_filter_out
 
 # GEONETWORK METADATA GETTER
-def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None ):
+def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None, username=None, password=None ):
 
     params = {}
 
@@ -38,13 +38,16 @@
 
     logging.debug("Get metadata pages, from record no. %s to record no. %s." % (params['from'],params['to']))
 
-    res = requests.get(root_url, params=params)
+    if username != None and password != None:
+        res = requests.get(root_url, params=params, auth=(username, password))
+    else:
+        res = requests.get(root_url, params=params)
 
     logging.debug(res.url)
 
     try:
         res.json()['metadata']
     except KeyError as e:
-        raise RecordNotFound('The record with uuid=%s was not found! Are you sure that it actually exists?' % uuid)
+        raise RecordNotFound('The record with uuid=%s was not found! Are you sure that it actually exists and that you have the proper access rights?' % uuid)
 
     if type(res.json()['metadata']) is list:
         records = res.json()['metadata']
@@ -128,18 +131,18 @@ def main(cfg):
 
     uuids_to_get = cfg['metadata_getter']['uuids_to_get']
     uuids_to_filter_out = cfg['metadata_getter']['uuids_to_filter_out']
-
-    #print(uuids_to_get)
+    username = cfg['geonetwork']['username']
+    password = cfg['geonetwork']['password']
 
     if 'all' not in uuids_to_get:
 
         for uuid_to_get in uuids_to_get:
 
-            for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=uuid_to_get, the_filter=uuids_to_filter_out ):
+            for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=uuid_to_get, the_filter=uuids_to_filter_out, username=username, password=password ):
                 send_page(cfg['session']['id'], cfg['geonetwork']['url'], cfg['indexer']['index'], page, channel, exchange, queue_name)
 
     else:
 
-        for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=None, the_filter=uuids_to_filter_out ):
+        for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=None, the_filter=uuids_to_filter_out, username=username, password=password ):
             send_page(cfg['session']['id'], cfg['geonetwork']['url'], cfg['indexer']['index'], page, channel, exchange, queue_name)
 
diff --git a/2-metadata-processor.py b/2-metadata-processor.py
index f9442f8ca9ba064d7f55c89fde4353547005d2cb..e53e95e5d21f3a5b5889875b7963894692303934 100644
--- a/2-metadata-processor.py
+++ b/2-metadata-processor.py
@@ -122,7 +122,7 @@ def list_to_dictlist( the_input, the_context=None ):
 
     return the_output
 
-def process_records( in_records, geonetwork_root_url, working_directory ):
+def process_records( in_records, geonetwork_root_url, working_directory, credentials ):
 
     #print( in_records[0].keys() )
 
@@ -148,7 +148,7 @@ def process_records( in_records, geonetwork_root_url, working_directory ):
             #exit(1)
             del out_record['metadata-fr']['link']
             tmp = list_to_dictlist(in_record['link'], 'link')#links
-            out_record['metadata-fr']['link'] = enrich_links( fix_links(tmp), working_directory )
+            out_record['metadata-fr']['link'] = enrich_links( fix_links(tmp, credentials), working_directory )
 
         if 'userinfo' in out_record['metadata-fr'].keys():
             del out_record['metadata-fr']['userinfo']
@@ -258,7 +258,7 @@ def process_page( channel, method, properties, body, **kwargs):
 
     page = decoded_body['body']
 
-    out_records = process_records( page, geonetwork_root_url, kwargs['working_directory'] )
+    out_records = process_records( page, geonetwork_root_url, kwargs['working_directory'], kwargs['credentials'] )
     #print(json.dumps(out_records[0], indent=4))
 
     #dispatch
@@ -342,6 +342,7 @@ def main(cfg):
     channel.queue_bind(exchange=exchange, queue=docs_to_enrich_qn, routing_key=docs_to_enrich_rk)
 
     working_directory = cfg['session']['working_directory']
+    credentials = cfg['credentials']
 
     #logging.info('Waiting for messages...')
 
@@ -351,7 +352,8 @@
                                            exchange=exchange,
                                            docs_to_index_rk=docs_to_index_rk,
                                            docs_to_enrich_rk=docs_to_enrich_rk,
-                                           working_directory=working_directory),
+                                           working_directory=working_directory,
+                                           credentials=credentials),
                           queue=metadata_pages_to_process_qn)#, no_ack=True)
 
     channel.start_consuming()
diff --git a/3-doc-enricher.py b/3-doc-enricher.py
index cb3fd27a1bcffa71822357157db67844bab8a1dd..8e79a9df2fb142bb19b34d5d7dd54e5185b0e8e0 100644
--- a/3-doc-enricher.py
+++ b/3-doc-enricher.py
@@ -50,9 +50,10 @@ def get_entries_from_postgis( link, cfg, no_features_per_page=1000 ):
 
     return
 
-def get_wfs( link, offset=0, no_features_per_page=1000 ):
+def get_wfs( link, credentials, offset=0, no_features_per_page=1000 ):
 
     root_url = link['url']
+    print(offset, no_features_per_page)
 
     params = {}
     params['version'] = '2.0.0'
@@ -73,7 +74,19 @@
     params['startindex'] = offset #0 + cnt*no_features_per_page
     #params['to'] = params['from'] + no_records_per_page - 1
 
-    res = requests.get(root_url, params = params)
+    with_credentials = False
+    for domain in credentials:
+        if domain in link['url'] and credentials[domain]['username'] != None and credentials[domain]['password']:
+            logging.info('Found a valid credential.')
+            with_credentials = True
+            username = credentials[domain]['username']
+            password = credentials[domain]['password']
+            break
+
+    if with_credentials:
+        res = requests.get(root_url, params = params, auth=(username, password))
+    else:
+        res = requests.get(root_url, params = params)
 
     logging.debug(res.url)
 
@@ -123,8 +136,7 @@ def old_enrich_docs( channel, method, properties, body, **kwargs ):
 
         logging.info('Enriching dataset named: %s' % decoded_body['body']['metadata-fr']['title'])
 
-        feature_page = get_wfs(wfs_info, offset, kwargs['features_per_page'])
-        #feature_page = get_entries_from_postgis(wfs_info, kwargs['postgis_cfg'])
+        feature_page = get_wfs(wfs_info, kwargs['credentials'], offset, kwargs['features_per_page'])
 
         # we implement pagination by letting this program creating tasks for itself / its siblings
         if feature_page != None and len(feature_page) == kwargs['features_per_page']: # at least another page is needed
diff --git a/README.md b/README.md
index 11d31ba87e0dd477869e9927adb5e70de8b202a1..2542e58a6eb64d7327870270d64027d8bbc2687a 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ N.B.: Steps 6-12 can also be performed at the same time, in separate terminals.
 
 ## TODO
 
-* implementing the authenticated access to (meta)data sources; extracting a small sample of restricted access datasets out of the "full" documents, to be used as a teaser for the not-yet-authorized user
+* extracting a small sample of restricted access datasets out of the "full" documents, to be used as a teaser for the not-yet-authorized user
 * incremental updates
 * the field type detection takes a lot of time: can it be optimized?
 * logging, reporting
diff --git a/utils/fix_links.py b/utils/fix_links.py
index f1a6765d9d7a17351112087b3179ab4efa621e7a..423a98b08210a89c1eee8fa8640ecef41ac1420a 100644
--- a/utils/fix_links.py
+++ b/utils/fix_links.py
@@ -2,6 +2,8 @@ import re
 from pprint import pprint
 import json
 import requests
+import urllib.request
+import base64
 from .my_logging import logging
 #import logging
 
@@ -59,7 +61,7 @@ def protocol_to_formats_and_services( links ):
 
     return output
 
-def fix_links( links ):
+def fix_links( links, credentials ):
 
     fixed_links = links.copy()
 
@@ -81,9 +83,30 @@
         # FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
         # The KML protocol needs also to be fixed.
         if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
+            print()
+            print(link['url'])
             try:
                 # let's try getting the information from the Web Server...
-                resp = requests.head( link['url'], allow_redirects=True )
+                with_credentials = False
+                for domain in credentials:
+                    if domain in link['url'] and credentials[domain]['username'] != None and credentials[domain]['password']:
+                        logging.info('Found a valid credential.')
+                        with_credentials = True
+                        username = credentials[domain]['username']
+                        password = credentials[domain]['password']
+                        break
+
+                if with_credentials:
+                    # N.B.:
+                    # when used with Basic Auth, the requests library strips the Content-Length header from the response;
+                    # that's why we use urllib.request here...
+                    req = urllib.request.Request( link['url'], method="HEAD" )
+                    base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii'))
+                    req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))
+                    resp = urllib.request.urlopen(req)
+                else:
+                    resp = requests.head( link['url'], allow_redirects=True )
+
                 # the presence of the content-length assures that the Web Server knows what it is talking about,
                 # that is why we include the following line in this try-except block, except for HTML pages, in which case
                 # webservers do not send back the content-length