From 7d727aca8102b7fd2f4e78fd3b37ccb96f4ceb34 Mon Sep 17 00:00:00 2001 From: Fabien Forestier <fforestier@MacBookAir.local> Date: Fri, 26 Jun 2020 09:58:18 +0200 Subject: [PATCH] Fix bug that was triggering field catalog recreation in a loop when a field of a dataset had only null values --- README.md | 4 +++- tools/field_type_detector.py | 3 +++ workers/doc-processor.py | 8 +++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0edc1d7..446952f 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ The most "tedious" part of the workflow regards the heuristic detection of data Some "editorial metadata" are added to raw (meta)data before actually inserting documents into Elasticsearch (cf. the "doc-indexer" module). -A simplified overview of the entire workflow is provided by the attached [draw.io](https://www.draw.io) diagram. +Here is a simplified overview of the entire workflow. + + ## How-to diff --git a/tools/field_type_detector.py b/tools/field_type_detector.py index 6ba1692..9dea1e3 100644 --- a/tools/field_type_detector.py +++ b/tools/field_type_detector.py @@ -100,6 +100,9 @@ def elect_field_type( data ): found_types = set(analysis['types'].keys()) if found_types == set(['NoneType']): + # if no type has been found for a particular field, + # we are still adding the field to the catalog but with a None type + types[k].append(None) continue if 'NoneType' in found_types: diff --git a/workers/doc-processor.py b/workers/doc-processor.py index c53203d..9bf31b8 100644 --- a/workers/doc-processor.py +++ b/workers/doc-processor.py @@ -64,8 +64,14 @@ def fix_field_types( in_docs, out_types ): out_flattened_properties[prop] = convert_to_str(in_flattened_properties[prop]) elif out_types[lookup_key] == 'bool': out_flattened_properties[prop] = convert_to_boolean(in_flattened_properties[prop]) + elif not out_types[lookup_key]: + # If going through this step it means that a value has been found for that particular field + # so 
the type for that field shouldn't be null; to fix that, we launch the recreation + # of the catalog + logging.debug('type %s found, recreating fields catalog', out_types[lookup_key]) + raise FieldTypeNotFound(lookup_key) else: - logging.critical('type %s not supported', out_types[prop]) + logging.critical('type %s not supported', out_types[lookup_key]) sys.exit(1) # pprint -- GitLab