Commit 0999c390 authored by Alessandro Cerioni

Access to (meta)data with user credentials: first working version :-)

parent a05901ca
@@ -14,7 +14,7 @@ def filter_function( x, the_uuids_to_filter_out ):
     return x['geonet:info']['uuid'] not in the_uuids_to_filter_out

 # GEONETWORK METADATA GETTER
-def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None ):
+def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None, username=None, password=None ):

     params = {}
@@ -38,13 +38,16 @@ def get_pages( root_url, no_records_per_page, uuid=None, the_filter=None ):
         logging.debug("Get metadata pages, from record no. %s to record no. %s." % (params['from'],params['to']))

-        res = requests.get(root_url, params=params)
+        if username != None and password != None:
+            res = requests.get(root_url, params=params, auth=(username, password))
+        else:
+            res = requests.get(root_url, params=params)

         logging.debug(res.url)

         try:
             res.json()['metadata']
         except KeyError as e:
-            raise RecordNotFound('The record with uuid=%s was not found! Are you sure that it actually exists?' % uuid)
+            raise RecordNotFound('The record with uuid=%s was not found! Are you sure that it actually exists and that you have the proper access rights?' % uuid)

         if type(res.json()['metadata']) is list:
             records = res.json()['metadata']
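Note on the `auth=` argument introduced above: passing a `(username, password)` tuple is the requests shorthand for HTTP Basic Auth, equivalent to the explicit `HTTPBasicAuth` form. A minimal sketch, with placeholder URL, params and credentials:

```python
import requests
from requests.auth import HTTPBasicAuth

# auth=(user, pwd) is shorthand for HTTPBasicAuth(user, pwd).
# URL, params and credentials below are placeholders.
res = requests.get(
    "https://example.org/geonetwork/srv/fre/q",
    params={"from": 1, "to": 20},
    auth=HTTPBasicAuth("alice", "s3cret"),
)
res.raise_for_status()
print(res.json().get("metadata", []))
```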
@@ -128,18 +131,18 @@ def main(cfg):
     uuids_to_get = cfg['metadata_getter']['uuids_to_get']
     uuids_to_filter_out = cfg['metadata_getter']['uuids_to_filter_out']
-    #print(uuids_to_get)
+    username = cfg['geonetwork']['username']
+    password = cfg['geonetwork']['password']

     if 'all' not in uuids_to_get:
         for uuid_to_get in uuids_to_get:
-            for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=uuid_to_get, the_filter=uuids_to_filter_out ):
+            for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=uuid_to_get, the_filter=uuids_to_filter_out, username=username, password=password ):
                 send_page(cfg['session']['id'], cfg['geonetwork']['url'], cfg['indexer']['index'], page, channel, exchange, queue_name)
     else:
-        for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=None, the_filter=uuids_to_filter_out ):
+        for page in get_pages( cfg['geonetwork']['url'], cfg['geonetwork']['records_per_page'], uuid=None, the_filter=uuids_to_filter_out, username=username, password=password ):
             send_page(cfg['session']['id'], cfg['geonetwork']['url'], cfg['indexer']['index'], page, channel, exchange, queue_name)
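`main(cfg)` now reads the GeoNetwork credentials from the configuration. A hypothetical sketch of the `cfg` excerpt implied by the keys accessed above; key names come from the diff, all values are placeholders:

```python
# Hypothetical cfg excerpt: key names come from the diff, values are made up.
cfg = {
    'session': {'id': 'some-session-id', 'working_directory': '/tmp/work'},
    'geonetwork': {
        'url': 'https://example.org/geonetwork/srv/fre/q',  # placeholder
        'records_per_page': 20,
        'username': 'alice',   # get_pages() falls back to anonymous access
        'password': 's3cret',  # when either of these is None
    },
    'metadata_getter': {
        'uuids_to_get': ['all'],
        'uuids_to_filter_out': [],
    },
    'indexer': {'index': 'some-index'},
}
```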
@@ -122,7 +122,7 @@ def list_to_dictlist( the_input, the_context=None ):
     return the_output

-def process_records( in_records, geonetwork_root_url, working_directory ):
+def process_records( in_records, geonetwork_root_url, working_directory, credentials ):

     #print( in_records[0].keys() )

@@ -148,7 +148,7 @@ def process_records( in_records, geonetwork_root_url, working_directory ):
         #exit(1)

         del out_record['metadata-fr']['link']
         tmp = list_to_dictlist(in_record['link'], 'link')#links
-        out_record['metadata-fr']['link'] = enrich_links( fix_links(tmp), working_directory )
+        out_record['metadata-fr']['link'] = enrich_links( fix_links(tmp, credentials), working_directory )

         if 'userinfo' in out_record['metadata-fr'].keys():
             del out_record['metadata-fr']['userinfo']

@@ -258,7 +258,7 @@ def process_page( channel, method, properties, body, **kwargs):
     page = decoded_body['body']

-    out_records = process_records( page, geonetwork_root_url, kwargs['working_directory'] )
+    out_records = process_records( page, geonetwork_root_url, kwargs['working_directory'], kwargs['credentials'] )
     #print(json.dumps(out_records[0], indent=4))

     #dispatch

@@ -342,6 +342,7 @@ def main(cfg):
     channel.queue_bind(exchange=exchange, queue=docs_to_enrich_qn, routing_key=docs_to_enrich_rk)

     working_directory = cfg['session']['working_directory']
+    credentials = cfg['credentials']

     #logging.info('Waiting for messages...')

@@ -351,7 +352,8 @@
                               exchange=exchange,
                               docs_to_index_rk=docs_to_index_rk,
                               docs_to_enrich_rk=docs_to_enrich_rk,
-                              working_directory=working_directory),
+                              working_directory=working_directory,
+                              credentials=credentials),
                           queue=metadata_pages_to_process_qn)#, no_ack=True)

     channel.start_consuming()
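The call site above apparently binds the extra keyword arguments (now including `credentials`) into the consumer callback, presumably with `functools.partial`, so that pika still invokes it with the standard four positional arguments. A minimal sketch of that pattern, assuming pika 1.x's `on_message_callback` keyword and placeholder queue/credential values:

```python
import functools
import pika

def process_page(channel, method, properties, body, **kwargs):
    # kwargs carries the side-channel context, e.g. kwargs['credentials']
    print(kwargs['working_directory'], list(kwargs['credentials']))
    channel.basic_ack(delivery_tag=method.delivery_tag)

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.basic_consume(
    on_message_callback=functools.partial(
        process_page,
        working_directory='/tmp/work',  # placeholder
        credentials={'example.org': {'username': 'alice', 'password': 's3cret'}},
    ),
    queue='metadata_pages_to_process',  # placeholder queue name
)
channel.start_consuming()
```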
@@ -50,9 +50,10 @@ def get_entries_from_postgis( link, cfg, no_features_per_page=1000 ):
     return

-def get_wfs( link, offset=0, no_features_per_page=1000 ):
+def get_wfs( link, credentials, offset=0, no_features_per_page=1000 ):

     root_url = link['url']
+    print(offset, no_features_per_page)

     params = {}
     params['version'] = '2.0.0'

@@ -73,7 +74,19 @@ def get_wfs( link, offset=0, no_features_per_page=1000 ):
     params['startindex'] = offset #0 + cnt*no_features_per_page
     #params['to'] = params['from'] + no_records_per_page - 1

-    res = requests.get(root_url, params = params)
+    with_credentials = False
+    for domain in credentials:
+        if domain in link['url'] and credentials[domain]['username'] != None and credentials[domain]['password']:
+            logging.info('Found a valid credential.')
+            with_credentials = True
+            username = credentials[domain]['username']
+            password = credentials[domain]['password']
+            break
+
+    if with_credentials:
+        res = requests.get(root_url, params = params, auth=(username, password))
+    else:
+        res = requests.get(root_url, params = params)

     logging.debug(res.url)

@@ -123,8 +136,7 @@ def old_enrich_docs( channel, method, properties, body, **kwargs ):
             logging.info('Enriching dataset named: %s' % decoded_body['body']['metadata-fr']['title'])

-            feature_page = get_wfs(wfs_info, offset, kwargs['features_per_page'])
-            #feature_page = get_entries_from_postgis(wfs_info, kwargs['postgis_cfg'])
+            feature_page = get_wfs(wfs_info, kwargs['credentials'], offset, kwargs['features_per_page'])

             # we implement pagination by letting this program creating tasks for itself / its siblings
             if feature_page != None and len(feature_page) == kwargs['features_per_page']: # at least another page is needed
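`get_wfs()` above (and `fix_links()` further down) resolve credentials by checking whether a configured domain string occurs in the link URL; note that the lookup compares `username` against `None` but tests only the truthiness of `password`. A hypothetical shape for that mapping, with placeholder domains and values:

```python
# Hypothetical credentials mapping: domain substring -> account.
credentials = {
    'download.example.org': {'username': 'alice', 'password': 's3cret'},
    'wfs.example.net':      {'username': None,    'password': None},  # anonymous
}

# Equivalent of the lookup loop in the diff, for a sample link:
link = {'url': 'https://download.example.org/wfs'}
match = next(
    (c for domain, c in credentials.items()
     if domain in link['url'] and c['username'] is not None and c['password']),
    None,
)
print(match)  # -> {'username': 'alice', 'password': 's3cret'}
```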
@@ -29,7 +29,7 @@ N.B.: Steps 6-12 can also be performed at the same time, in separate terminals.
 ## TODO

-* implementing the authenticated access to (meta)data sources; extracting a small sample of restricted access datasets out of the "full" documents, to be used as a teaser for the not-yet-authorized user
+* extracting a small sample of restricted access datasets out of the "full" documents, to be used as a teaser for the not-yet-authorized user
 * incremental updates
 * the field type detection takes a lot of time: can it be optimized?
 * logging, reporting
@@ -2,6 +2,8 @@ import re
 from pprint import pprint
 import json
 import requests
+import urllib.request
+import base64
 from .my_logging import logging
 #import logging

@@ -59,7 +61,7 @@ def protocol_to_formats_and_services( links ):
     return output

-def fix_links( links ):
+def fix_links( links, credentials ):

     fixed_links = links.copy()

@@ -81,9 +83,30 @@ def fix_links( links ):
         # FIX links in which the declared protocol is as bizarre as "WWW:LINK-1.0-http--link"
         # The KML protocol needs also to be fixed.
         if 'protocol' in link.keys() and any( [x in link['protocol'] for x in ['WWW', 'kml', 'html', 'null'] ] ):
+            print()
+            print(link['url'])
             try:
                 # let's try getting the information from the Web Server...
-                resp = requests.head( link['url'], allow_redirects=True )
+                with_credentials = False
+                for domain in credentials:
+                    if domain in link['url'] and credentials[domain]['username'] != None and credentials[domain]['password']:
+                        logging.info('Found a valid credential.')
+                        with_credentials = True
+                        username = credentials[domain]['username']
+                        password = credentials[domain]['password']
+                        break
+
+                if with_credentials:
+                    # N.B.:
+                    # when used with Basic Auth, the requests library strips the Content-Length header from the response;
+                    # that's why we use urllib.request here...
+                    req = urllib.request.Request( link['url'], method="HEAD" )
+                    base64string = base64.b64encode(("%s:%s" % (username, password)).encode('ascii'))
+                    req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))
+                    resp = urllib.request.urlopen(req)
+                else:
+                    resp = requests.head( link['url'], allow_redirects=True )
+
                 # the presence of the content-length assures that the Web Server knows what it is talking about,
                 # that is why we include the following line in this try-except block, except for HTML pages, in which case
                 # webservers do not send back the content-length
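The in-diff comment claims that requests drops the Content-Length header from authenticated HEAD responses, which is why urllib.request is used for the authenticated branch. A quick, self-contained way to compare the two clients against the same endpoint; URL and credentials are placeholders:

```python
import base64
import urllib.request

import requests

url = 'https://example.org/protected/data.zip'  # placeholder
user, pwd = 'alice', 's3cret'                   # placeholders

# HEAD via requests, with Basic Auth
r = requests.head(url, auth=(user, pwd), allow_redirects=True)
print('requests :', r.headers.get('Content-Length'))

# HEAD via urllib.request, with a hand-built Basic Auth header
req = urllib.request.Request(url, method='HEAD')
token = base64.b64encode(('%s:%s' % (user, pwd)).encode('ascii')).decode('ascii')
req.add_header('Authorization', 'Basic %s' % token)
with urllib.request.urlopen(req) as resp:
    print('urllib   :', resp.headers.get('Content-Length'))
```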