diff --git a/2-metadata-processor.py b/2-metadata-processor.py index a9847db696eb41f8480ae32a2713c04af050d1a1..f9442f8ca9ba064d7f55c89fde4353547005d2cb 100644 --- a/2-metadata-processor.py +++ b/2-metadata-processor.py @@ -208,7 +208,10 @@ def process_records( in_records, geonetwork_root_url, working_directory ): # let's delete some attributes which are very specific to GeoNetwork - attribs_to_delete = ['userinfo', 'isHarvested', 'isTemplate', 'owner', 'displayOrder', 'publishedForGroup', 'valid'] + attribs_to_delete = ['userinfo', 'isHarvested', 'isTemplate', 'owner', \ + 'displayOrder', 'publishedForGroup', 'valid', 'docLocale', \ + 'popularity', 'mdLanguage', 'root', 'rating', 'source', \ + 'defaultTitle', 'datasetLang', 'geoDesc', 'locale', 'logo'] for attrib in attribs_to_delete: try: diff --git a/7-doc-indexer.py b/7-doc-indexer.py index 86ea6353c22b22f035fc4ddea017d30eb108c4fa..b5cdbe15df5dece9d36b0ece9d9f89a3c1f5944f 100644 --- a/7-doc-indexer.py +++ b/7-doc-indexer.py @@ -68,6 +68,9 @@ def tag_doc( the_doc ): if any( [x in the_doc['data-fr']['geometry']['type'] for x in ['Polygon', 'MultiPolygon']] ): tag_dict['isAreal'] = True + # isSample? docs that are tagged by this script are never just a sample + tag_dict['isSample'] = False + tagged_doc = {'editorial-metadata': tag_dict, **the_doc} return tagged_doc diff --git a/es_template.py b/es_template.py index 1bbf8ffb3ab6e8d5b66b61bc74ca0403b9b78d98..cd28b9033d80239ff7b97edee38673dab13f67b4 100644 --- a/es_template.py +++ b/es_template.py @@ -113,12 +113,28 @@ template = { } } }, + # { + # "link-template" : { + # "path_match": "metadata-fr.link", + # "mapping": { + # #"type": "nested", + # "index": "false" + # #"ignore_malformed": True + # } + # } + # }, { - "link-template" : { - "path_match": "metadata-fr.link", + "keyword-template" : { + "match_pattern": "regex", + "path_match": ".*md5.*|metadata-fr\.link\.formats.*|metadata-fr\.link\.service.*|metadata-fr\.parentId.*", "mapping": { - #"type": "nested", - "index": "false" + "type": "text", + "index": False, + "fields": { + "keyword": { + "type": "keyword" + } + } #"ignore_malformed": True } } @@ -153,10 +169,23 @@ template = { } }, { - "unindexed-path-template": { + "unindexed-path-template-1": { + "match_pattern": "regex", + "match_mapping_type": "*", + "path_match": "metadata-fr\.href.*|metadata-fr\.idxMsg.*|data-fr\.geometry\..*|metadata-fr\.identifier.*|metadata-fr\.geonet\:info.*:geonet|metadata-fr\.responsibleParty\.logo|metadata-fr\.image\..*|.*url|metadata-fr\.link\.name", + # "match": "(metadata-fr\.image.*|data-fr\.geometry.*|metadata-fr\.href.*|metadata-fr\.idxMsg.*)", + "mapping": { + "type": "text", + #"ignore_malformed": True + "index": False + } + } + }, + { + "unindexed-path-template-2": { "match_pattern": "regex", "match_mapping_type": "*", - "path_match": "metadata-fr\.href.*|metadata-fr\.idxMsg.*|data-fr\.geometry\..*|metadata-fr\.identifier.*|metadata-fr\.geonet\:info\.@xmlns:geonet|metadata-fr\.responsibleParty\.logo|metadata-fr\.image\..*|.*url|metadata-fr\.link\.name", + "path_match": "metadata-fr\.link\.bbox_by_projection|metadata-fr\.link\.projections|metadata-fr\.link\.content-type|metadata-fr\.link\.content-length", # "match": "(metadata-fr\.image.*|data-fr\.geometry.*|metadata-fr\.href.*|metadata-fr\.idxMsg.*)", "mapping": { "type": "text", diff --git a/utils/fix_links.py b/utils/fix_links.py index 2261dc99fa0b3ff6d28e15bb1d2c1523604d0548..f1a6765d9d7a17351112087b3179ab4efa621e7a 100644 --- a/utils/fix_links.py +++ b/utils/fix_links.py @@ -66,6 +66,9 @@ def fix_links( links ): # the 'protocol' attribute is used, today in a rather meaningless way; let's try improving it... (WWW-LINK -> PDF, ...) for k, link in enumerate(links): + if 'unknown' in link.keys(): + del fixed_links[k]['unknown'] + # the 'description' attribute ends, sometimes, with (OGC:WMS), (OGC:WCS), ..., which is redundant and, in some case, erroneus. if 'description' in link.keys(): fixed_links[k]['description'] = re.sub(r'\(OGC:WMS\)|\(OGC:WCS\)|\(OGC:WFS\)|\(OGC:SOS\)', '', link['description']).strip()