diff --git a/README.md b/README.md index 0edc1d7f2c2009be37a8e8526c5af09be1be0998..446952f6aa4c24660932811dba9138fd586635bc 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,9 @@ The most "tedious" part of the workflow regards the heuristic detection of data Some "editorial metadata" are added to raw (meta)data before actually inserting documents into Elasticsearch (cf. the "doc-indexer" module). -A simplified overview of the entire workflow is provided by the attached [draw.io](https://www.draw.io) diagram. +Here is a simplified overview of the entire workflow. + + ## How-to diff --git a/tools/field_type_detector.py b/tools/field_type_detector.py index 6ba1692c3c8f59370c1f807b42550d5adf37e1c8..9dea1e3387a5512221fc7235d94c77e028b9520b 100644 --- a/tools/field_type_detector.py +++ b/tools/field_type_detector.py @@ -100,6 +100,9 @@ def elect_field_type( data ): found_types = set(analysis['types'].keys()) if found_types == set(['NoneType']): + # if no type has been found for a particular field, + # we are still adding the field to the catalog but with a None type + types[k].append(None) continue if 'NoneType' in found_types: diff --git a/workers/doc-processor.py b/workers/doc-processor.py index c53203d6875788cb322c402808593a9c56177b41..9bf31b82822dd6d12e65b943812af2a08f73e982 100644 --- a/workers/doc-processor.py +++ b/workers/doc-processor.py @@ -64,8 +64,14 @@ def fix_field_types( in_docs, out_types ): out_flattened_properties[prop] = convert_to_str(in_flattened_properties[prop]) elif out_types[lookup_key] == 'bool': out_flattened_properties[prop] = convert_to_boolean(in_flattened_properties[prop]) + elif not out_types[lookup_key]: + # If going through this step it means that a value has been found for that particular field + # so the type for that field shouldn't be null, in order to fix that we launch the recreation + # of the catalog + logging.debug('type %s found, recreating fields catalog', out_types[lookup_key]) + raise FieldTypeNotFound(lookup_key) else: - logging.critical('type %s not supported', out_types[prop]) + logging.critical('type %s not supported', out_types[lookup_key]) sys.exit(1) # pprint