From 4bbb977adeb4a3ff68a0a016220227aac92f88ab Mon Sep 17 00:00:00 2001
From: Sébastien DA ROCHA
Date: Fri, 4 Dec 2020 11:39:18 +0100
Subject: [PATCH 1/3] Fix data sort

---
 lib/elasticsearch_template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/elasticsearch_template.py b/lib/elasticsearch_template.py
index ed4fa9f..ecb67fb 100644
--- a/lib/elasticsearch_template.py
+++ b/lib/elasticsearch_template.py
@@ -53,7 +53,7 @@ template = {
             "analyzer": {
                 "my_sort_analyzer": {
                     "type": "custom",
-                    "tokenizer": "standard",
+                    "tokenizer": "keyword",
                     "filter": [
                         "original_preserving_ascii_folding"
                     ]
--
GitLab

From 09725259fe4c3b10c7684e7c02896abac7397ba2 Mon Sep 17 00:00:00 2001
From: Sébastien DA ROCHA
Date: Fri, 4 Dec 2020 11:40:38 +0100
Subject: [PATCH 2/3] Add debug output

---
 lib/postgis_helper.py        | 1 +
 tools/field_type_detector.py | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/postgis_helper.py b/lib/postgis_helper.py
index 85fd008..0465027 100644
--- a/lib/postgis_helper.py
+++ b/lib/postgis_helper.py
@@ -203,6 +203,7 @@ def main(**kwargs):
     schema_names = conn.get_schema_names()
     for schema in schema_names:
         if schema_name and not schema_name == schema:
+            logging.debug(f"not {schema_name}, skipping {schema}")
             continue
         for table in conn.get_tables(schema=schema):
             if table_name and not table_name == table.name:
diff --git a/tools/field_type_detector.py b/tools/field_type_detector.py
index 78d92fe..91a7863 100644
--- a/tools/field_type_detector.py
+++ b/tools/field_type_detector.py
@@ -151,12 +151,12 @@ def generate_field_catalog( cfg ):

     logging.info('Getting schemas...')
     schema_names = pg.get_schema_names()
-    logging.info('Done.')
+    logging.info('Done: %s', schema_names)

     for schema_name in schema_names:
         if schema_whitelist is not None:
             if schema_name not in schema_whitelist:
-                logging.debug('Skipping schema %s' % schema_name)
+                logging.debug('Skipping schema %s (not in whitelist %s)', schema_name, schema_whitelist)
                 continue
         for table in pg.get_tables(schema_name):
             if schema_dot_table_whitelist is not None:
@@ -245,7 +245,9 @@ def main(cfg):
     elected_field_types = elect_field_type( field_catalog_by_field )

     with open(filename2, 'w') as fp:
+        logging.debug("writing %s", filename2)
         json.dump(elected_field_types, fp, sort_keys=True)
+        logging.debug(json.dumps(elected_field_types))

     final_field_catalog_by_dbschematable = field_catalog_by_dbschematable.copy()

@@ -254,7 +256,10 @@ def main(cfg):
             final_field_catalog_by_dbschematable[db_schema_table]['types'][field] = elected_field_types[field]

     with open(filename3, 'w') as fp:
+        logging.debug("writing %s", filename3)
         json.dump(final_field_catalog_by_dbschematable, fp, sort_keys=True)
+        from pprint import pformat
+        logging.debug(pformat(json.dumps(final_field_catalog_by_dbschematable)))

     return
--
GitLab

From c7cc6c6c48dc5b9d1cbcb9abdb287aa7e63832e9 Mon Sep 17 00:00:00 2001
From: Sébastien DA ROCHA
Date: Fri, 4 Dec 2020 11:40:53 +0100
Subject: [PATCH 3/3] Allow sample generation if no source_url is specified (reindex on the same host)

---
 workers/sample_generator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/workers/sample_generator.py b/workers/sample_generator.py
index d86d5f3..82dd4e9 100644
--- a/workers/sample_generator.py
+++ b/workers/sample_generator.py
@@ -30,7 +30,10 @@ def callback(channel, method, properties, body):
     #data = res.json()

     # get sample records from the ingest index
-    source_es = Elasticsearch([cfg['reindexer']['source_url']], timeout=60)
+    if 'source_url' in cfg['reindexer'].keys():
+        source_es = Elasticsearch([cfg['reindexer']['source_url']], timeout=60)
+    else:
+        source_es = Elasticsearch([cfg['reindexer']['destination_url']], timeout=60)

     the_query = dict()
     the_query['size'] = sample_size
--
GitLab
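
Note on PATCH 1/3: the sort fix relies on the keyword tokenizer emitting the whole field value as a single token, whereas the standard tokenizer splits the value into words, so only the first word effectively drove the sort order. Below is a minimal sketch (not part of the patches above) that compares the two tokenizers through the _analyze API; it assumes the elasticsearch Python client and a cluster reachable at localhost:9200.

from elasticsearch import Elasticsearch

# Hypothetical local cluster; adjust the URL to your environment.
es = Elasticsearch(['http://localhost:9200'])

text = "Liste des points d'interet 2020"
for tokenizer in ("standard", "keyword"):
    result = es.indices.analyze(body={"tokenizer": tokenizer, "text": text})
    # "standard" yields one token per word; "keyword" yields the whole string.
    print(tokenizer, [t["token"] for t in result["tokens"]])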
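
Note on PATCH 3/3: the added branch simply reuses the destination cluster as the sample source when no source_url is configured. The same logic can be written more compactly with dict.get; this is a sketch only, assuming the cfg layout visible in the diff, and the get_source_es helper is hypothetical rather than part of workers/sample_generator.py.

from elasticsearch import Elasticsearch

def get_source_es(cfg, timeout=60):
    # Prefer the dedicated source cluster; fall back to the destination
    # cluster when no source_url is configured (reindex on the same host).
    reindexer = cfg['reindexer']
    url = reindexer.get('source_url', reindexer['destination_url'])
    return Elasticsearch([url], timeout=timeout)

# Example usage with an assumed configuration:
cfg = {'reindexer': {'destination_url': 'http://localhost:9200'}}
source_es = get_source_es(cfg)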