diff --git a/1-metadata-getter.py b/1-metadata-getter.py index 650e361c8ceed2bfd0fd5bc687b7ae0e9b095233..4cca31479209f104397d05d12518b4e4799dee6b 100644 --- a/1-metadata-getter.py +++ b/1-metadata-getter.py @@ -79,7 +79,7 @@ def send_page( the_session_id, the_geonetwork_root_url, the_dest_index, the_page msg = {'header': {'geonetwork_root_url': the_geonetwork_root_url, 'session_id': the_session_id, 'dest_index': the_dest_index}, 'body': the_page} - the_body = msgpack.packb(msg, use_bin_type=True) + the_body = msgpack.packb(msg, use_bin_type=False) the_channel.basic_publish( exchange=the_exchange, routing_key=the_routing_key, diff --git a/2-metadata-processor.py b/2-metadata-processor.py index 9b85f60557a2c7373076088371b53179888dcfaa..be1a776f7c243fa73a522f750b50fbf8a3a2e64e 100644 --- a/2-metadata-processor.py +++ b/2-metadata-processor.py @@ -6,11 +6,12 @@ import time from dateutil.parser import parse import hashlib import json -from utils.exit_gracefully import exit_gracefully import re +from utils.exit_gracefully import exit_gracefully from utils.my_logging import logging from utils.fix_links import fix_links from utils.enrich_links import enrich_links +from utils.generate_slug import generate_slug def list_to_dictlist( the_input, the_context=None ): @@ -142,10 +143,13 @@ def process_records( in_records, geonetwork_root_url, working_directory, credent #print( in_records[0].keys() ) - out_records = [] + #out_records = [] for in_record in in_records: + the_uuid = in_record['geonet:info']['uuid'] + logging.info("Processing record %s..." % the_uuid) + out_record = {} # all the content of the original record in "mounted" at "metadata-fr" out_record['metadata-fr'] = in_record.copy() @@ -264,10 +268,11 @@ def process_records( in_records, geonetwork_root_url, working_directory, credent #pprint(out_record) - out_records.append(out_record) + #out_records.append(out_record) #print('-'*80) + yield out_record - return out_records + #return out_records @@ -289,10 +294,6 @@ def process_page( channel, method, properties, body, **kwargs): #dispatch for metadata_record in out_records: - the_uuid = metadata_record['metadata-fr']['geonet:info']['uuid'] - logging.info("Processing record %s..." % the_uuid) - - # let's look for a WFS ressource to potentially fetch and index... wfs_found = False @@ -310,6 +311,10 @@ def process_page( channel, method, properties, body, **kwargs): full_version = metadata_record.copy() # including metadata AND data full_version['uuid'] = metadata_record['metadata-fr']['geonet:info']['uuid'] + '.full' full_version['type'] = metadata_record['metadata-fr']['type'] + the_full_title = metadata_record['metadata-fr']['title'] + the_slug = generate_slug(the_full_title) + logging.info('Slug for "%s": %s' % (the_full_title, the_slug)) + full_version['slug'] = the_slug msg = {'header': {'wfs_info': link, 'offset': 0, 'session_id': session_id, 'dest_index': dest_index}, 'body': full_version} @@ -326,6 +331,10 @@ def process_page( channel, method, properties, body, **kwargs): meta_version = metadata_record.copy() # including metadata ONLY meta_version['uuid'] = metadata_record['metadata-fr']['geonet:info']['uuid'] + '.meta' meta_version['type'] = metadata_record['metadata-fr']['type'] + the_full_title = metadata_record['metadata-fr']['title'] + the_slug = generate_slug(the_full_title) + logging.info('Slug for "%s": %s' % (the_full_title, the_slug)) + meta_version['slug'] = the_slug msg = {'header': { "index" : { "_index" : dest_index, "_type" : "_doc" } }, 'body': meta_version} the_body = msgpack.packb(msg, use_bin_type=True) diff --git a/es_template.py b/es_template.py index aed13472ca03049dbd9bc7e3ae9a839a6a6bd9d5..9424b8fa0208e3ed21c5020ec9e2800054f75935 100644 --- a/es_template.py +++ b/es_template.py @@ -126,7 +126,7 @@ template = { { "keyword-template" : { "match_pattern": "regex", - "path_match": ".*md5.*|metadata-fr\.link\.formats.*|metadata-fr\.link\.service.*|metadata-fr\.parentId.*", + "path_match": ".*md5.*|metadata-fr\.link\.formats.*|metadata-fr\.link\.service.*|metadata-fr\.parentId.*|slug", "mapping": { "type": "text", "index": False, diff --git a/requirements.txt b/requirements.txt index b7eae43e7a682f97eeccc121619bf53964968aa2..d15bb6c5aaa102e702f3886aedf30509e3767efe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,5 @@ elasticsearch>=6.0.0,<7.0.0 GeoAlchemy2==0.5.0 psycopg2-binary>=2.7.0 sqlalchemy>=1.2.0,<1.3.0 +python-slugify +nltk