Commit 0999c390 authored by Alessandro Cerioni

Access to (meta)data with user credentials: first working version :-)

parent a05901ca
@@ -14,7 +14,7 @@ def filter_function( x, the_uuids_to_filter_out ):
     return x['geonet:info']['uuid'] not in the_uuids_to_filter_out

 # GEONETWORK METADATA GETTER
-def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None ):
+def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None, username=None, password=None ):

     params = {}
@@ -38,13 +38,16 @@ def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None ):
         logging.debug("Get metadata pages, from record no. %s to record no. %s." % (params['from'],params['to']))

-        res = requests.get(root_url, params=params)
+        if username != None and password != None:
+            res = requests.get(root_url, params=params, auth=(username, password))
+        else:
+            res = requests.get(root_url, params=params)

         logging.debug(res.url)

         try:
             res.json()['metadata']
         except KeyError as e:
-            raise RecordNotFound('The record with uuid=%s was not found! Are you sure that it actually exists?' % uuid)
+            raise RecordNotFound('The record with uuid=%s was not found! Are you sure that it actually exists and that you have the proper access rights?' % uuid)

         if type(res.json()['metadata']) is list:
             records = res.json()['metadata']
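Note on the `auth=` argument introduced above: passing a `(username, password)` tuple is the requests shorthand for HTTP Basic Auth, equivalent to the explicit `HTTPBasicAuth` form. A minimal sketch, with placeholder URL, params and credentials:

```python
import requests
from requests.auth import HTTPBasicAuth

# auth=(user, pwd) is shorthand for HTTPBasicAuth(user, pwd).
# URL, params and credentials below are placeholders.
res = requests.get(
    "https://example.org/geonetwork/srv/fre/q",
    params={"from": 1, "to": 20},
    auth=HTTPBasicAuth("alice", "s3cret"),
)
res.raise_for_status()
print(res.json().get("metadata", []))
```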
@@ -128,18 +131,18 @@ def main(cfg):
     uuids_to_get = cfg['metadata_getter']['uuids_to_get']
     uuids_to_filter_out = cfg['metadata_getter']['uuids_to_filter_out']
-    #print(uuids_to_get)
+    username = cfg['geonetwork']['username']
+    password = cfg['geonetwork']['password']

     if 'all' not in uuids_to_get:
         for uuid_to_get in uuids_to_get:
-            for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=uuid_to_get, the_filter=uuids_to_filter_out ):
+            for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=uuid_to_get, the_filter=uuids_to_filter_out, username=username, password=password ):
                 send_page(cfg['session']['id'], cfg['geonetwork']['url'], cfg['indexer']['index'], page, channel, exchange, queue_name)
     else:
-        for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=None, the_filter=uuids_to_filter_out ):
+        for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=None, the_filter=uuids_to_filter_out, username=username, password=password ):
             send_page(cfg['session']['id'], cfg['geonetwork']['url'], cfg['indexer']['index'], page, channel, exchange, queue_name)
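`main(cfg)` now reads the GeoNetwork credentials from the configuration. A hypothetical sketch of the `cfg` excerpt implied by the keys accessed above; key names come from the diff, all values are placeholders:

```python
# Hypothetical cfg excerpt: key names come from the diff, values are made up.
cfg = {
    'session': {'id': 'some-session-id', 'working_directory': '/tmp/work'},
    'geonetwork': {
        'url': 'https://example.org/geonetwork/srv/fre/q',  # placeholder
        'records_per_page': 20,
        'username': 'alice',   # get_pages() falls back to anonymous access
        'password': 's3cret',  # when either of these is None
    },
    'metadata_getter': {
        'uuids_to_get': ['all'],
        'uuids_to_filter_out': [],
    },
    'indexer': {'index': 'some-index'},
}
```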
@@ -122,7 +122,7 @@ def list_to_dictlist( the_input, the_context=None ):
     return the_output

-def process_records( in_records, geonetwork_root_url, working_directory ):
+def process_records( in_records, geonetwork_root_url, working_directory, credentials ):

     #print( in_records[0].keys() )

@@ -148,7 +148,7 @@ def process_records( in_records, geonetwork_root_url, working_directory ):
         #exit(1)

         del out_record['metadata-fr']['link']
         tmp = list_to_dictlist(in_record['link'], 'link')#links
-        out_record['metadata-fr']['link'] = enrich_links( fix_links(tmp), working_directory )
+        out_record['metadata-fr']['link'] = enrich_links( fix_links(tmp, credentials), working_directory )

         if 'userinfo' in out_record['metadata-fr'].keys():
             del out_record['metadata-fr']['userinfo']

@@ -258,7 +258,7 @@ def process_page( channel, method, properties, body, **kwargs):
     page = decoded_body['body']

-    out_records = process_records( page, geonetwork_root_url, kwargs['working_directory'] )
+    out_records = process_records( page, geonetwork_root_url, kwargs['working_directory'], kwargs['credentials'] )
     #print(json.dumps(out_records[0], indent=4))

     #dispatch

@@ -342,6 +342,7 @@ def main(cfg):
     channel.queue_bind(exchange=exchange, queue=docs_to_enrich_qn, routing_key=docs_to_enrich_rk)

     working_directory = cfg['session']['working_directory']
+    credentials = cfg['credentials']

     #logging.info('Waiting for messages...')

@@ -351,7 +352,8 @@
                               exchange=exchange,
                               docs_to_index_rk=docs_to_index_rk,
                               docs_to_enrich_rk=docs_to_enrich_rk,
-                              working_directory=working_directory),
+                              working_directory=working_directory,
+                              credentials=credentials),
                           queue=metadata_pages_to_process_qn)#, no_ack=True)

     channel.start_consuming()
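The call site above apparently binds the extra keyword arguments (now including `credentials`) into the consumer callback, presumably with `functools.partial`, so that pika still invokes it with the standard four positional arguments. A minimal sketch of that pattern, assuming pika 1.x's `on_message_callback` keyword and placeholder queue/credential values:

```python
import functools
import pika

def process_page(channel, method, properties, body, **kwargs):
    # kwargs carries the side-channel context, e.g. kwargs['credentials']
    print(kwargs['working_directory'], list(kwargs['credentials']))
    channel.basic_ack(delivery_tag=method.delivery_tag)

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.basic_consume(
    on_message_callback=functools.partial(
        process_page,
        working_directory='/tmp/work',  # placeholder
        credentials={'example.org': {'username': 'alice', 'password': 's3cret'}},
    ),
    queue='metadata_pages_to_process',  # placeholder queue name
)
channel.start_consuming()
```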
@@ -50,9 +50,10 @@ def get_entries_from_postgis( link, cfg, no_features_per_page=1000 ):
     return

-def get_wfs( link, offset=0, no_features_per_page=1000 ):
+def get_wfs( link, credentials, offset=0, no_features_per_page=1000 ):

     root_url = link['url']
+    print(offset, no_features_per_page)

     params = {}
     params['version'] = '2.0.0'

@@ -73,7 +74,19 @@ def get_wfs( link, offset=0, no_features_per_page=1000 ):
     params['startindex'] = offset #0 + cnt*no_features_per_page
     #params['to'] = params['from'] + no_records_per_page - 1

-    res = requests.get(root_url, params = params)
+    with_credentials = False
+    for domain in credentials:
+        if domain in link['url'] and credentials[domain]['username'] != None and credentials[domain]['password']:
+            logging.info('Found a valid credential.')
+            with_credentials = True
+            username = credentials[domain]['username']
+            password = credentials[domain]['password']
+            break
+
+    if with_credentials:
+        res = requests.get(root_url, params = params, auth=(username, password))
+    else:
+        res = requests.get(root_url, params = params)

     logging.debug(res.url)

@@ -123,8 +136,7 @@ def old_enrich_docs( channel, method, properties, body, **kwargs ):
             logging.info('Enriching dataset named: %s' % decoded_body['body']['metadata-fr']['title'])

-            feature_page = get_wfs(wfs_info, offset, kwargs['features_per_page'])
-            #feature_page = get_entries_from_postgis(wfs_info, kwargs['postgis_cfg'])
+            feature_page = get_wfs(wfs_info, kwargs['credentials'], offset, kwargs['features_per_page'])

             # we implement pagination by letting this program creating tasks for itself / its siblings
             if feature_page != None and len(feature_page) == kwargs['features_per_page']: # at least another page is needed
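`get_wfs()` above (and `fix_links()` further down) resolve credentials by checking whether a configured domain string occurs in the link URL; note that the lookup compares `username` against `None` but tests only the truthiness of `password`. A hypothetical shape for that mapping, with placeholder domains and values:

```python
# Hypothetical credentials mapping: domain substring -> account.
credentials = {
    'download.example.org': {'username': 'alice', 'password': 's3cret'},
    'wfs.example.net':      {'username': None,    'password': None},  # anonymous
}

# Equivalent of the lookup loop in the diff, for a sample link:
link = {'url': 'https://download.example.org/wfs'}
match = next(
    (c for domain, c in credentials.items()
     if domain in link['url'] and c['username'] is not None and c['password']),
    None,
)
print(match)  # -> {'username': 'alice', 'password': 's3cret'}
```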
@@ -29,7 +29,7 @@ N.B.: Steps 6-12 can also be performed at the same time, in separate terminals.
 ## TODO

-* implementing the authenticated access to (meta)data sources; extracting a small sample of restricted access datasets out of the "full" documents, to be used as a teaser for the not-yet-authorized user
+* extracting a small sample of restricted access datasets out of the "full" documents, to be used as a teaser for the not-yet-authorized user
 * incremental updates
 * the field type detection takes a lot of time: can it be optimized?
 * logging, reporting
@@ -2,6 +2,8 @@ import re
 from pprint import pprint
 import json
 import requests
+import urllib.request
+import base64
 from .my_logging import logging
 #import logging

@@ -59,7 +61,7 @@ def protocol_to_formats_and_services( links ):
     return output

-def fix_links( links ):
+def fix_links( links, credentials ):

     fixed_links = links.copy()

@@ -81,9 +83,30 @@ def fix_links( links ):
         # FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
         # The KML protocol needs also to be fixed.
         if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
+            print()
+            print(link['url'])
             try:
                 # let's try getting the information from the Web Server...
-                resp = requests.head( link['url'], allow_redirects=True )
+                with_credentials = False
+                for domain in credentials:
+                    if domain in link['url'] and credentials[domain]['username'] != None and credentials[domain]['password']:
+                        logging.info('Found a valid credential.')
+                        with_credentials = True
+                        username = credentials[domain]['username']
+                        password = credentials[domain]['password']
+                        break
+
+                if with_credentials:
+                    # N.B.:
+                    # when used with Basic Auth, the requests library strips the Content-Length header from the response;
+                    # that's why we use urllib.request here...
+                    req = urllib.request.Request( link['url'], method="HEAD" )
+                    base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii'))
+                    req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))
+                    resp = urllib.request.urlopen(req)
+                else:
+                    resp = requests.head( link['url'], allow_redirects=True )
+
                 # the presence of the content-length assures that the Web Server knows what it is talking about,
                 # that is why we include the following line in this try-except block, except for HTML pages, in which case
                 # webservers do not send back the content-length
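The in-diff comment claims that requests drops the Content-Length header from authenticated HEAD responses, which is why urllib.request is used for the authenticated branch. A quick, self-contained way to compare the two clients against the same endpoint; URL and credentials are placeholders:

```python
import base64
import urllib.request

import requests

url = 'https://example.org/protected/data.zip'  # placeholder
user, pwd = 'alice', 's3cret'                   # placeholders

# HEAD via requests, with Basic Auth
r = requests.head(url, auth=(user, pwd), allow_redirects=True)
print('requests :', r.headers.get('Content-Length'))

# HEAD via urllib.request, with a hand-built Basic Auth header
req = urllib.request.Request(url, method='HEAD')
token = base64.b64encode(('%s:%s' % (user, pwd)).encode('ascii')).decode('ascii')
req.add_header('Authorization', 'Basic %s' % token)
with urllib.request.urlopen(req) as resp:
    print('urllib   :', resp.headers.get('Content-Length'))
```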