From 7d727aca8102b7fd2f4e78fd3b37ccb96f4ceb34 Mon Sep 17 00:00:00 2001
From: Fabien Forestier <fforestier@MacBookAir.local>
Date: Fri, 26 Jun 2020 09:58:18 +0200
Subject: [PATCH] Fix bug that was triggering field catalog recreation in a
 loop when a field of a dataset had only null values

---
 README.md                    | 4 +++-
 tools/field_type_detector.py | 3 +++
 workers/doc-processor.py     | 8 +++++++-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0edc1d7..446952f 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,9 @@ The most "tedious" part of the workflow regards the heuristic detection of data
 
 Some "editorial metadata" are added to raw (meta)data before actually inserting documents into Elasticsearch (cf. the "doc-indexer" module).
 
-A simplified overview of the entire workflow is provided by the attached [draw.io](https://www.draw.io) diagram.
+Here is a simplified overview of the entire workflow.
+
+![Indexer workflow diagram](./doc/data-grandlyon-com-indexer-workflow-drawio.png)
 
 ## How-to
 
diff --git a/tools/field_type_detector.py b/tools/field_type_detector.py
index 6ba1692..9dea1e3 100644
--- a/tools/field_type_detector.py
+++ b/tools/field_type_detector.py
@@ -100,6 +100,9 @@ def elect_field_type( data ):
             found_types = set(analysis['types'].keys())
 
             if found_types == set(['NoneType']):
+                # If only null values were found for this field, no type can be elected;
+                # we still add the field to the catalog, but with a None type.
+                types[k].append(None)
                 continue
 
             if 'NoneType' in found_types:
diff --git a/workers/doc-processor.py b/workers/doc-processor.py
index c53203d..9bf31b8 100644
--- a/workers/doc-processor.py
+++ b/workers/doc-processor.py
@@ -64,8 +64,14 @@ def fix_field_types( in_docs, out_types ):
                 out_flattened_properties[prop] = convert_to_str(in_flattened_properties[prop])
             elif out_types[lookup_key] == 'bool':
                 out_flattened_properties[prop] = convert_to_boolean(in_flattened_properties[prop])
+            elif not out_types[lookup_key]:
+                # Reaching this branch means that a value has been found for a field whose
+                # type in the catalog is null, which should not happen once a value exists.
+                # To fix that, we trigger the recreation of the catalog.
+                logging.debug('type %s found, recreating fields catalog', out_types[lookup_key])
+                raise FieldTypeNotFound(lookup_key)
             else:
-                logging.critical('type %s not supported', out_types[prop])
+                logging.critical('type %s not supported', out_types[lookup_key])
                 sys.exit(1)
 
         # pprint
-- 
GitLab