From fcc5fe8278a825741b3991db0782ebbebaae1125 Mon Sep 17 00:00:00 2001
From: Alessandro Cerioni <acerioni@grandlyon.com>
Date: Sun, 10 Mar 2019 18:34:52 +0100
Subject: [PATCH] Removed some fields from metadata. Updated es_template in
 order to avoid indexing some fields as full-text.

---
 2-metadata-processor.py |  5 ++++-
 7-doc-indexer.py        |  3 +++
 es_template.py          | 41 +++++++++++++++++++++++++++++++++++------
 utils/fix_links.py      |  3 +++
 4 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/2-metadata-processor.py b/2-metadata-processor.py
index a9847db..f9442f8 100644
--- a/2-metadata-processor.py
+++ b/2-metadata-processor.py
@@ -208,7 +208,10 @@ def process_records( in_records, geonetwork_root_url, working_directory ):
 
 
         # let's delete some attributes which are very specific to GeoNetwork
-        attribs_to_delete = ['userinfo', 'isHarvested', 'isTemplate', 'owner', 'displayOrder', 'publishedForGroup', 'valid']
+        attribs_to_delete = ['userinfo', 'isHarvested', 'isTemplate', 'owner', \
+                                'displayOrder', 'publishedForGroup', 'valid', 'docLocale', \
+                                'popularity', 'mdLanguage', 'root', 'rating', 'source', \
+                                'defaultTitle', 'datasetLang', 'geoDesc', 'locale', 'logo']
 
         for attrib in attribs_to_delete:
             try:
diff --git a/7-doc-indexer.py b/7-doc-indexer.py
index 86ea635..b5cdbe1 100644
--- a/7-doc-indexer.py
+++ b/7-doc-indexer.py
@@ -68,6 +68,9 @@ def tag_doc( the_doc ):
             if any( [x in the_doc['data-fr']['geometry']['type'] for x in ['Polygon', 'MultiPolygon']] ):
                 tag_dict['isAreal'] = True
 
+    # isSample: docs tagged by this script are never mere samples, so the flag is always False
+    tag_dict['isSample'] = False
+
     tagged_doc = {'editorial-metadata': tag_dict, **the_doc}
 
     return tagged_doc
diff --git a/es_template.py b/es_template.py
index 1bbf8ff..cd28b90 100644
--- a/es_template.py
+++ b/es_template.py
@@ -113,12 +113,28 @@ template = {
                         }
                     }
                 },
+                # {
+                #     "link-template" : {
+                #         "path_match": "metadata-fr.link",
+                #         "mapping": {
+                #             #"type": "nested",
+                #             "index": "false"
+                #             #"ignore_malformed": True
+                #         }
+                #     }
+                # },
                 {
-                    "link-template" : {
-                        "path_match": "metadata-fr.link",
+                    "keyword-template" : {
+                        "match_pattern": "regex",
+                        "path_match": ".*md5.*|metadata-fr\.link\.formats.*|metadata-fr\.link\.service.*|metadata-fr\.parentId.*",
                         "mapping": {
-                            #"type": "nested",
-                            "index": "false"
+                            "type": "text",
+                            "index": False,
+                            "fields": {
+                                "keyword": {
+                                    "type": "keyword"
+                                }
+                            }
                             #"ignore_malformed": True
                         }
                     }
@@ -153,10 +169,23 @@ template = {
                     }
                 },
                 {
-                    "unindexed-path-template": {
+                    "unindexed-path-template-1": {
+                        "match_pattern": "regex",
+                        "match_mapping_type": "*",
+                        "path_match": "metadata-fr\.href.*|metadata-fr\.idxMsg.*|data-fr\.geometry\..*|metadata-fr\.identifier.*|metadata-fr\.geonet\:info.*:geonet|metadata-fr\.responsibleParty\.logo|metadata-fr\.image\..*|.*url|metadata-fr\.link\.name",
+                        # "match": "(metadata-fr\.image.*|data-fr\.geometry.*|metadata-fr\.href.*|metadata-fr\.idxMsg.*)",
+                        "mapping": {
+                            "type": "text",
+                            #"ignore_malformed": True
+                            "index": False
+                        }
+                    }
+                },
+                {
+                    "unindexed-path-template-2": {
                         "match_pattern": "regex",
                         "match_mapping_type": "*",
-                        "path_match": "metadata-fr\.href.*|metadata-fr\.idxMsg.*|data-fr\.geometry\..*|metadata-fr\.identifier.*|metadata-fr\.geonet\:info\.@xmlns:geonet|metadata-fr\.responsibleParty\.logo|metadata-fr\.image\..*|.*url|metadata-fr\.link\.name",
+                        "path_match": "metadata-fr\.link\.bbox_by_projection|metadata-fr\.link\.projections|metadata-fr\.link\.content-type|metadata-fr\.link\.content-length",
                         # "match": "(metadata-fr\.image.*|data-fr\.geometry.*|metadata-fr\.href.*|metadata-fr\.idxMsg.*)",
                         "mapping": {
                             "type": "text",
diff --git a/utils/fix_links.py b/utils/fix_links.py
index 2261dc9..f1a6765 100644
--- a/utils/fix_links.py
+++ b/utils/fix_links.py
@@ -66,6 +66,9 @@ def fix_links( links ):
     # the 'protocol' attribute is used, today in a rather meaningless way; let's try improving it... (WWW-LINK -> PDF, ...)
     for k, link in enumerate(links):
 
+        if 'unknown' in link.keys():
+            del fixed_links[k]['unknown']
+
         # the 'description' attribute ends, sometimes, with (OGC:WMS), (OGC:WCS), ..., which is redundant and, in some case, erroneus.
         if 'description' in link.keys():
             fixed_links[k]['description'] = re.sub(r'\(OGC:WMS\)|\(OGC:WCS\)|\(OGC:WFS\)|\(OGC:SOS\)', '', link['description']).strip()
-- 
GitLab