Project: web-et-numerique / web-et-numerique-internet / data.grandlyon.com / web-portal / components / indexers / metadata-and-data

Commit 3e2e8a01, authored 4 years ago by Sébastien DA ROCHA
remove dead code

Parent: a6d7399d
Merge requests: !11 "1.2.0", !7 "9436 typage dans nom attr"
Pipeline #8193 passed, 4 years ago (stage: sonar-analysis)
Showing 3 changed files with 5 additions and 154 deletions:

tools/field_type_detector.py: 0 additions, 59 deletions
workers/doc-enricher.py: 2 additions, 90 deletions
workers/doc-processor.py: 3 additions, 5 deletions
tools/field_type_detector.py (+0 −59)
@@ -20,65 +20,6 @@ class NotEmptyQueueException(Exception):

    pass


def old_generate_field_catalog( cfg, mongo_collection ):

    # with open("config.yaml", 'r') as yamlfile:
    #     cfg = yaml.load(yamlfile)

    # read from MongoDB
    docs = mongo_collection.find()
    total = mongo_collection.count_documents({})

    #cnt = 0

    types_by_dbschematable_by_field = {}
    types_by_dbschematable_by_field['fields'] = {}
    types_by_dbschematable_by_field['analyzed_docs'] = 0

    #for doc in tqdm(docs, total=total, file=sys.stdout):
    for doc in docs:

        properties = doc['data-fr']['properties']
        dataset_uuid = doc['metadata-fr']['geonet:info']['uuid']
        dataset_title = doc['metadata-fr']['title']

        flattened_properties = flatten_json(properties)

        #logging.INFO("Analyzing dataset %s %s" % (dataset_uuid, dataset_title))
        #logging.debug('there')

        # ---------------------------------------------------------------------------------------------
        for k, v in flattened_properties.items():

            # generate equivalent element kk out of k (in the sense of equivalence classes, cf. https://en.wikipedia.org/wiki/Equivalence_class)
            # ex.: openinghoursspecification.XX.validThrough -> openinghoursspecification.0.validThrough
            # N.B.: integers like .0, .1, .2, ... are inserted by the flatten_json function whenever a list is found...
            kk = re.sub(r'\.\d+', '.0', k)

            if kk not in types_by_dbschematable_by_field['fields'].keys():
                types_by_dbschematable_by_field['fields'][kk] = {}

            if dataset_uuid not in types_by_dbschematable_by_field['fields'][kk].keys():
                types_by_dbschematable_by_field['fields'][kk][dataset_uuid] = {'title': dataset_title, 'types': []}

            type_set = set(types_by_dbschematable_by_field['fields'][kk][dataset_uuid]['types'])
            type_set.add(detect_type(v))
            types_by_dbschematable_by_field['fields'][kk][dataset_uuid]['types'] = list(type_set)

        types_by_dbschematable_by_field['analyzed_docs'] += 1
        logging.info("%s/%s" % (types_by_dbschematable_by_field['analyzed_docs'], total))

        # useful for debugging:
        if cfg['field_type_detector']['debug'] and types_by_dbschematable_by_field['analyzed_docs'] > 1000:
            break

    return types_by_dbschematable_by_field


def elect_field_type( data ):

    logging.info("Electing field types...")
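The removed old_generate_field_catalog grouped flattened property keys into equivalence classes by collapsing the numeric list indices that flatten_json inserts. A minimal, self-contained sketch of that normalization step (the sample keys below are made up for illustration):

import re

# Flattened keys: flatten_json turns list elements into .0, .1, .2, ... segments.
keys = [
    "openinghoursspecification.0.validThrough",
    "openinghoursspecification.3.validThrough",
    "address.streetAddress",
]

# Collapse every numeric index to .0 so all elements of a list map to the same field.
normalized = {re.sub(r'\.\d+', '.0', k) for k in keys}

print(sorted(normalized))
# ['address.streetAddress', 'openinghoursspecification.0.validThrough']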
workers/doc-enricher.py (+2 −90)
@@ -54,6 +54,8 @@ def get_entries_from_postgis( link, cfg, no_features_per_page=1000 ):

    feature_page = [] # we accumulate entries in this sort of buffer
    cnt = 0
    for entry in pg.get_entries(table):
        # TODO: add the field type to the columns around here
        feature_page.append(entry)
        if len(feature_page) == no_features_per_page:
            cnt += 1
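For reference, the buffering pattern above (accumulate entries until a full page of no_features_per_page items is collected, then hand the page off) can be written as a small generator. This is only an illustrative sketch, not the project's code; the paginate name is made up:

def paginate(entries, no_features_per_page=1000):
    """Yield fixed-size pages of entries; the last page may be shorter."""
    feature_page = []  # we accumulate entries in this sort of buffer
    for entry in entries:
        feature_page.append(entry)
        if len(feature_page) == no_features_per_page:
            yield feature_page
            feature_page = []
    if feature_page:
        yield feature_page

# usage sketch: for page in paginate(pg.get_entries(table)): publish or store the page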
@@ -84,13 +86,11 @@ def get_wfs( link, credentials, offset=0, no_features_per_page=1000 ):

    #params['startindex'] = 11
    params['SRSNAME'] = 'epsg:4326'

    #startindex = 0
    cnt = offset / no_features_per_page + 1

    logging.info('WFS page %i; offset = %i' % (cnt, offset))
    params['startindex'] = offset #0 + cnt*no_features_per_page
    #params['to'] = params['from'] + no_records_per_page - 1

    with_credentials = False
    for domain in credentials:
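The SRSNAME and startindex parameters set above drive offset-based paging of a WFS GetFeature request. A hedged, self-contained sketch of that request loop using requests; the endpoint URL, layer name, and outputFormat value are placeholders, and the stop condition mirrors the commented-out logic further down (a short page means the last page was reached):

import logging
import requests

def iter_wfs_features(url, typename, no_features_per_page=1000):
    """Yield features from a WFS endpoint, requesting no_features_per_page at a time."""
    offset = 0
    while True:
        params = {
            'service': 'WFS',
            'version': '2.0.0',
            'request': 'GetFeature',
            'typename': typename,                # placeholder layer name
            'outputFormat': 'application/json',  # assumes a JSON-capable endpoint
            'SRSNAME': 'epsg:4326',
            'startindex': offset,
            'count': no_features_per_page,
        }
        res = requests.get(url, params=params)
        try:
            features = res.json()['features']
        except Exception:
            logging.error("Failed WFS request: %s" % res.url)
            return
        yield from features
        if len(features) < no_features_per_page:
            break  # last page reached
        offset += no_features_per_page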
@@ -109,104 +109,16 @@ def get_wfs( link, credentials, offset=0, no_features_per_page=1000 ):

    logging.debug(res.url)

    try:
        # print(res.status_code)
        # print(res.text)
        # print(res.json())
        features = res.json()['features']
        #processed_features = process_features(features)
        logging.debug(len(features))
        return features

        # if len(features) < no_features_per_page:
        #     break # it means that we have reached the last page
        #
        # cnt += 1

    except: #json.decoder.JSONDecodeError: # it means that the WFS request failed, for instance because of insufficient access rights
        #logging.error("Failed WFS request: %s" % res.url)
        logging.error("Failed WFS request: %s" % res.url)
        #yield None
        #raise Exception("Failed WFS request: %s" % res.url)
        return None

    #print()


def old_enrich_docs( channel, method, properties, body, **kwargs ):

    decoded_body = msgpack.unpackb(body, raw=False)

    wfs_info   = decoded_body['header']['wfs_info']
    offset     = decoded_body['header']['offset']
    session_id = decoded_body['header']['session_id']
    dest_index = decoded_body['header']['dest_index']

    logging.info('Enriching dataset named: %s' % decoded_body['body']['metadata-fr']['title'])

    feature_page = get_wfs(wfs_info, kwargs['credentials'], offset, kwargs['features_per_page'])

    # we implement pagination by letting this program create tasks for itself / its siblings
    if feature_page != None and len(feature_page) == kwargs['features_per_page']: # at least another page is needed
        msg = {'header': {'wfs_info': wfs_info,
                          'offset': offset + kwargs['features_per_page'],
                          'session_id': session_id,
                          'dest_index': dest_index},
               'body': decoded_body['body']}
        the_body = msgpack.packb(msg, use_bin_type=True)

        channel.basic_publish(exchange=kwargs['exchange'],
                              routing_key=kwargs['docs_to_enrich_rk'],
                              body=the_body,
                              properties=pika.BasicProperties(delivery_mode=2))

    if feature_page != None:

        # try:
        #     #for feature_page in feature_pages:
        #         #print(feature_page[0]['properties']['nom_reduit'])
        logging.info('Sending feature page of len = %i to RabbitMQ and MongoDB...' % len(feature_page))
        doc_page = [{**decoded_body['body'], 'data-fr': feature} for feature in feature_page]

        msg = {'header': {'metadata': decoded_body['body'],
                          'session_id': session_id,
                          'dest_index': dest_index},
               'body': doc_page}
        the_body = msgpack.packb(msg, use_bin_type=True)

        channel.basic_publish(exchange=kwargs['exchange'],
                              routing_key=kwargs['doc_pages_to_store_in_mongo_rk'],
                              body=the_body,
                              properties=pika.BasicProperties(delivery_mode=2))

        channel.basic_publish(exchange=kwargs['exchange'],
                              routing_key=kwargs['doc_pages_to_process_rk'],
                              body=the_body,
                              properties=pika.BasicProperties(delivery_mode=2))

        logging.info('...done!')

    # except TypeError: # it means that getWFS returned None
    #     pass
    #     #
    #     #channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)

    channel.basic_ack(delivery_tag=method.delivery_tag)
    #channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)
    return #out_docs


def enrich_docs( channel, method, properties, body ):
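The pagination idea the removed old_enrich_docs implemented survives in enrich_docs: when a fetched page is full, the worker publishes a follow-up task with the offset advanced by features_per_page, so it or a sibling worker fetches the next page; a short page ends the chain. A condensed sketch of that callback (exchange and routing-key names are placeholders, and get_wfs refers to the function defined earlier in this file):

import msgpack
import pika

def on_doc_to_enrich(channel, method, properties, body, features_per_page=1000):
    task = msgpack.unpackb(body, raw=False)
    offset = task['header']['offset']

    # fetch one page of features for this task (get_wfs is defined above in this module)
    feature_page = get_wfs(task['header']['wfs_info'], {}, offset, features_per_page)

    if feature_page is not None and len(feature_page) == features_per_page:
        # a full page came back, so at least one more page may exist:
        # re-publish the same task with the offset advanced by one page
        task['header']['offset'] = offset + features_per_page
        channel.basic_publish(exchange='my_exchange',        # placeholder name
                              routing_key='docs_to_enrich',  # placeholder name
                              body=msgpack.packb(task, use_bin_type=True),
                              properties=pika.BasicProperties(delivery_mode=2))

    channel.basic_ack(delivery_tag=method.delivery_tag)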
workers/doc-processor.py (+3 −5)
@@ -27,13 +27,10 @@ def fix_field_types( in_docs, out_types ):

    for in_doc in in_docs:

        # metadata = in_doc['metadata-fr']
        # data = in_doc['data-fr']
        if 'data-fr' not in in_doc:
            out_docs.append(in_doc)
            continue
        #
        in_flattened_properties = flatten_json(in_doc['data-fr']['properties'])
        out_flattened_properties = in_flattened_properties.copy()
@@ -74,10 +71,11 @@ def fix_field_types( in_docs, out_types ):

                logging.critical('type %s not supported', out_types[lookup_key])
                sys.exit(1)

        # pprint
        out_doc = in_doc.copy()
        out_doc['data-fr']['properties'] = unflatten_json(out_flattened_properties)

        # amending addresses which are not in the schema.org format; we use out_doc in order to avoid reintroducing null values!
        if 'address' in out_doc['data-fr']['properties'].keys() and type(out_doc['data-fr']['properties']['address']) is str:
            the_street_address = in_doc['data-fr']['properties']['address']
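fix_field_types flattens each document's properties, looks up the elected type of every field via a normalized lookup key, casts the value, and unflattens the result, exiting when it meets a type it does not know how to cast. A minimal sketch of that casting step; the cast table, function name, and type labels are assumptions for illustration, not the project's actual mapping:

import logging
import re
import sys

# assumed mapping from elected type labels to Python casts (illustrative only)
CASTS = {
    'int': int,
    'float': float,
    'str': str,
    'bool': lambda v: str(v).lower() in ('true', '1'),
}

def cast_flattened_properties(flat_props, out_types):
    """Cast each flattened property to its elected type, keeping None values as-is."""
    fixed = {}
    for key, value in flat_props.items():
        lookup_key = re.sub(r'\.\d+', '.0', key)  # same normalization as the field catalog
        target = out_types.get(lookup_key)
        if value is None or target is None:
            fixed[key] = value
        elif target in CASTS:
            fixed[key] = CASTS[target](value)
        else:
            logging.critical('type %s not supported', target)
            sys.exit(1)
    return fixed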