Commit cab80eb9 authored by Alessandro CERIONI

Indexing posts & pages.

parent 8af1593f
@@ -4,7 +4,7 @@ import time
import json
import yaml
from main import index_everything, add_post, update_post, delete_post
from main import index_everything, add_content, update_content, delete_content
with open("config.yaml", 'r') as yamlfile:
cfg = yaml.safe_load(yamlfile)  # safe_load: no arbitrary Python object construction
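Note on configuration: the per-content-type lookups introduced by this commit, e.g. cfg['reindexer']['destination_index'][content_type], imply that config.yaml now nests one index name per content type. A minimal sketch of the assumed shape, written as the dict that yaml.safe_load would return (all concrete values are illustrative, not taken from the repo):

cfg = {
    'reindexer': {
        'source_url': 'http://localhost:9200',       # assumed
        'destination_url': 'http://localhost:9200',  # assumed
        'source_index': {'posts': 'posts.ingest', 'pages': 'pages.ingest'},
        'destination_index': {'posts': 'posts.v1', 'pages': 'pages.v1'},
    },
    'content_getter': {
        'ghost_api': 'https://blog.example.org/ghost/api',  # assumed root URL
        'ghost_admin_api_key': '<key-id>:<hex-secret>',     # assumed id:secret format
    },
}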
@@ -21,38 +21,61 @@ def hello():
# task_url = main(the_post_id)
# return jsonify( {'reindex_task_url': task_url} )
# @api.route("/test", methods=['GET', 'POST'])
# def test():
# data = request.data # data is empty
# #print(dir(request.data))
# json_data = json.loads(data)
# with open('output/output.json', 'w') as fp:
# json.dump(json_data, fp)
# #print( json.dumps(json_data, indent=4) )
# # need posted data here
# return jsonify( {'status': 'OK'} )
@api.route("/test", methods=['GET', 'POST'])
def test():
data = request.data.decode('utf-8')  # raw POST body, decoded from bytes
#print(dir(request.data))
json_data = json.loads(data)
with open('output/output.json', 'w') as fp:
json.dump(json_data, fp)
#print( json.dumps(json_data, indent=4) )
# need posted data here
return jsonify( {'status': 'OK'} )
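For reference, the /test route simply persists and acknowledges whatever JSON it receives; a quick way to exercise it (host and port are assumptions, depending on how the Flask app is served):

import requests

res = requests.post('http://localhost:5000/test', json={'hello': 'world'})
print(res.json())  # {'status': 'OK'}; the payload also lands in output/output.json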
@api.route("/index/posts", methods=['GET'])
def _index_posts():
task_url = index_everything(cfg)
task_url = index_everything(cfg, 'posts')
return jsonify( {'reindex_task_url': task_url} )
@api.route("/index/pages", methods=['GET'])
def _index_pages():
task_url = index_everything(cfg, 'pages')
return jsonify( {'reindex_task_url': task_url} )
@api.route("/add/post", methods=['POST'])
def _add_post():
json_data = json.loads(request.data)
task_url = add_post(cfg, json_data)
json_data = json.loads(request.data.decode('utf-8'))
task_url = add_content(cfg, 'posts', json_data)
return jsonify( {'reindex_task_url': task_url} )
@api.route("/add/page", methods=['POST'])
def _add_page():
json_data = json.loads(request.data.decode('utf-8'))
task_url = add_content(cfg, 'pages', json_data)
return jsonify( {'reindex_task_url': task_url} )
@api.route("/update/post", methods=['POST'])
def _update_post():
json_data = json.loads(request.data)
task_url = update_post(cfg, json_data)
json_data = json.loads(request.data.decode('utf-8'))
task_url = update_content(cfg, 'posts', json_data)
return jsonify( {'reindex_task_url': task_url} )
@api.route("/update/page", methods=['POST'])
def _update_page():
json_data = json.loads(request.data.decode('utf-8'))
task_url = update_content(cfg, 'pages', json_data)
return jsonify( {'reindex_task_url': task_url} )
@api.route("/delete/post", methods=['POST'])
def _delete_post():
json_data = json.loads(request.data)
status = delete_post(cfg, json_data)
json_data = json.loads(request.data.decode('utf-8'))
status = delete_content(cfg, 'posts', json_data)
return jsonify( {'status': status} )
@api.route("/delete/page", methods=['POST'])
def _delete_page():
json_data = json.loads(request.data.decode('utf-8'))
status = delete_content(cfg, 'pages', json_data)
return jsonify( {'status': status} )
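Taken together, the routes above form a small REST facade over the indexer: one GET endpoint per full reindex, one POST endpoint per incremental operation. A hedged usage sketch; the base URL and payload shape are assumptions, since the diff does not show what Ghost's webhooks actually post:

import requests

base = 'http://localhost:5000'  # assumed deployment address

# full reindex, one content type at a time
requests.get(base + '/index/posts')
requests.get(base + '/index/pages')

# incremental operations, e.g. wired to Ghost's page.added / page.deleted webhooks
payload = {'id': '5c9a1b2e0f'}  # illustrative id and shape
requests.post(base + '/add/page', json=payload)
requests.post(base + '/delete/page', json=payload)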
......
from elasticsearch import Elasticsearch
import json
def alias_setter(cfg):
def alias_setter(cfg, content_type):
es = Elasticsearch( [cfg['reindexer']['destination_url']] )
destination_index = cfg['reindexer']['destination_index'][content_type]
suffixes = ['published', 'draft'] #, 'private']
successful = True
for suffix in suffixes:
alias = "%s.%s" % (cfg['reindexer']['destination_index'], suffix)
alias = "%s.%s" % (destination_index, suffix)
the_body = { "filter" : {
"terms" : {
@@ -24,7 +25,7 @@ def alias_setter(cfg):
except Exception as e:
print(e)
res = es.indices.put_alias(index=cfg['reindexer']['destination_index'],
res = es.indices.put_alias( index=destination_index,
name=alias,
body=json.dumps(the_body))
#print(res)
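What put_alias with a filter body produces is a filtered alias: searches against <index>.published or <index>.draft only see documents matching the embedded query. The terms clause is truncated in this hunk; a self-contained sketch, assuming the filter keys off Ghost's status field:

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])  # assumed URL
es.indices.put_alias(
    index='posts.v1',           # assumed destination index name
    name='posts.v1.published',
    body={'filter': {'terms': {'status': ['published']}}},  # 'status' is an assumption
)
# Queries sent to posts.v1.published now only match published documents.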
......
# -*- coding: UTF-8 -*-
template = {
#"index_patterns" : ["posts.v1"],
"order" : 0,
"settings" : {
"index.mapping.total_fields.limit": 10000,
#"index.mapping.ignore_malformed": True,
"number_of_shards" : 1,
"number_of_replicas" : 0,
"max_ngram_diff": 100,
"analysis": {
"filter": {
"my_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : False
},
"my_original_preserving_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : True
},
"french_elision": {
"type": "elision",
"articles_case": True,
"articles": [
"l", "m", "t", "qu", "n", "s",
"j", "d", "c", "jusqu", "quoiqu",
"lorsqu", "puisqu"
]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"preserving_word_delimiter": {
"type": "word_delimiter",
"preserve_original": "true"
},
"protect_keywords": {
"type": "keyword_marker",
"keywords": ["vélo'v"]
}
# "shingle": {
# "type": "shingle",
# "min_shingle_size": 2,
# "max_shingle_size": 3
# }
# "french_keywords": {
# "type": "keyword_marker",
# "keywords": ["Exemple"]
# },
# "french_stemmer": {
# "type": "stemmer",
# "language": "light_french"
# }
},
"tokenizer": {
"my_edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 30,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"edge_ngram_analyzer_with_asciifolding": {
"type": "custom",
"tokenizer": "my_edge_ngram_tokenizer",
"filter": [
"lowercase",
"protect_keywords",
"my_original_preserving_ascii_folding",
"french_elision",
"french_stop"#,
#"preserving_word_delimiter"
]
},
"my_search_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"protect_keywords",
# "my_ascii_folding",
"french_elision",
"french_stop"#,
#"preserving_word_delimiter"
]
},
"my_suggest_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"protect_keywords",
# "my_ascii_folding",
"french_elision",
"french_stop"#,
#"preserving_word_delimiter"
]
},
# "trigram": {
# "type": "custom",
# "tokenizer": "standard",
# "filter": ["standard", "shingle"]
# },
# "reverse": {
# "type": "custom",
# "tokenizer": "standard",
# "filter": ["standard", "reverse"]
# }
}
}
},
"mappings" : {
"_doc" : {
"dynamic_templates": [ # priority is given by order!
{
"geoshape-template" : {
"match_pattern": "regex",
"path_match": "metadata-fr.bbox|data-fr.geometry",
"mapping": {
"type": "geo_shape",
"tree": "quadtree",
#"index": "false"
"ignore_malformed": True
}
}
},
{
"link-template" : {
"path_match": "metadata-fr.link",
"mapping": {
#"type": "nested",
"index": "false"
#"ignore_malformed": True
}
}
},
# {
# "exception1-template" : {
# "path_match": "data-fr.properties.date_photo",
# "mapping": {
# "type": "text"
# # "index": False,
# #"ignore_malformed": True
# }
# }
# },
{
"date-template" : {
"match_mapping_type": "date",
# "path_match": "data-fr\.properties\.date.+|data-fr\.properties\.last_update.*|metadata-fr\.creationDate",
"mapping": {
"type": "date",
"format": "strict_date_optional_time",#||epoch_millis||yyyy-MM-dd HH:mm:ss",
"fields": {
"sort": {
"type": "date"
}
}
# "index": False,
#"ignore_malformed": True
}
}
},
# {
# "unindexed-field-template": {
# "match_pattern": "regex",
# "match": "url|href",
# "mapping": {
# # "type": "keyword",
# "index": False,
# "ignore_malformed": True
# }
# }
# },
{
"unindexed-path-template": {
"match_pattern": "regex",
"match_mapping_type": "*",
"path_match": "metadata-fr\.href.*|metadata-fr\.idxMsg.*|data-fr\.geometry\..*|metadata-fr\.identifier.*|metadata-fr\.geonet\:info\.@xmlns:geonet|metadata-fr\.responsibleParty\.logo|metadata-fr\.image\..*|.*url|metadata-fr\.link\.name",
# "match": "(metadata-fr\.image.*|data-fr\.geometry.*|metadata-fr\.href.*|metadata-fr\.idxMsg.*)",
"mapping": {
"type": "text",
#"ignore_malformed": True
"index": False
}
}
},
{
"long-template": {
"match_mapping_type": "long",
"mapping": {
"type": "long",
"fields": {
"sort":
{
"type": "long"
}
}
}
}
},
{
"double-template": {
"match_mapping_type": "double",
"mapping": {
"type": "double",
"fields": {
"sort":
{
"type": "double"
}
}
}
}
},
{
"boolean-template": {
"match_mapping_type": "boolean",
"mapping": {
"type": "boolean",
"fields": {
"sort":
{
"type": "boolean"
}
}
}
}
},
# {
# "exception1-template": {
# "match_pattern": "regex",
# "match": "data-fr.properties.datemajgraph|data-fr.properties.date_creation", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "date",
# "ignore_malformed": True
# }
# }
# },
# {
# "exception2-template": {
# "match_mapping_type": "long",
# # "match": "numero", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "long",v9
# "ignore_malformed": True
# }
# }
# },
# {
# "exception3-template": {
# "match_pattern": "regex",
# "match": "data-fr\.properties\.address", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "object",
# "ignore_malformed": True
# }
# }
# },
# {
# "exception4-template": {
# "match_mapping_type": "object",
# # "match": "numero", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "object",
# "ignore_malformed": True
# }
# }
# },
# {
# "exception5-template": {
# "match_pattern": "regex",
# "match": "openinghoursspecification",
# # "match": "numero", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "nested",
# "ignore_malformed": True
# }
# }
# },
{
"data-template": {
"match_pattern": "regex",
"path_match": "data-fr\.properties\..+",
"match_mapping_type": "string",
"mapping": {
"type": "text",
# "ignore_malformed": True,
"analyzer": "edge_ngram_analyzer_with_asciifolding",
# "search_analyzer": "standard",
"search_analyzer": "my_search_analyzer",
"term_vector": "with_positions_offsets",
"copy_to": ["data", "data_and_metadata"],
"index_options": "offsets",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
},
"sort": {
"type": "keyword"
},
"suggest": {
"type": "completion",
"analyzer": "simple"
#"preserve_position_increments":
}
}
}
}
},
{
"string-template": {
"match_mapping_type": "string",
"mapping": {
"type": "text",
# "ignore_malformed": True,
"analyzer": "edge_ngram_analyzer_with_asciifolding",
# "search_analyzer": "standard",
"search_analyzer": "my_search_analyzer",
"term_vector": "with_positions_offsets",
"copy_to": "data_and_metadata",
"index_options": "offsets",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256,
},
"sort": {
"type": "keyword"
},
"suggest": {
"type": "text",
"analyzer": "my_suggest_analyzer"
#"preserve_position_increments":
}
# "trigram": {
# "type": "text",
# "analyzer": "trigram"
# },
# "reverse": {
# "type": "text",
# "analyzer": "reverse"
# }
}
}
}
}
]
}
}
}
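The dict above is an Elasticsearch 6.x index template (hence the _doc mapping type and the order-sensitive dynamic_templates). Since its index_patterns entry is commented out, the pattern is presumably supplied wherever the template is registered; a minimal sketch of that step, reusing the template dict and assuming index names consistent with the rest of this commit:

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])            # assumed URL
template['index_patterns'] = ['posts.v1*', 'pages.v1*']  # assumed patterns
es.indices.put_template(name='content-template', body=template)
# Indexes created afterwards with a matching name inherit these settings and mappings.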
@@ -13,7 +13,11 @@ def get_token( admin_key_key ):
return JWT
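The body of get_token is elided by the diff. For context, Ghost's v2 Admin API expects a short-lived JWT derived from an id:secret Admin API key; a minimal sketch with PyJWT (the helper name is hypothetical, not the repo's):

import datetime
import jwt  # PyJWT

def make_ghost_token(admin_api_key):
    key_id, secret = admin_api_key.split(':')
    iat = int(datetime.datetime.now().timestamp())
    return jwt.encode(
        {'iat': iat, 'exp': iat + 5 * 60, 'aud': '/v2/admin/'},
        bytes.fromhex(secret),
        algorithm='HS256',
        headers={'kid': key_id},
    )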
def get_post( cfg, the_post_id ):
def get_content( cfg, the_content_type, the_content_id ):
# the_content_type can be either posts or pages
if the_content_type not in ['posts', 'pages']:
raise Exception('Invalid content type.')
root_url = cfg['content_getter']['ghost_api']
admin_api_key = cfg['content_getter']['ghost_admin_api_key']
@@ -28,17 +32,21 @@ def get_post( cfg, the_post_id ):
headers['Authorization'] = 'Ghost %s' % jwt
# getting 1st page, in order to also know how many pages we have to go for...
res = requests.get( root_url + '/v2/admin/posts/%s?include=authors,tags&formats=html,plaintext' % (the_post_id), headers=headers )
res = requests.get( root_url + '/v2/admin/%s/%s?include=authors,tags&formats=html,plaintext' % (the_content_type, the_content_id), headers=headers )
if res.status_code == 200:
return res.json()['posts'][0]
return res.json()[the_content_type][0]
else:
raise Exception('Content not found.')
# print(dir(res))
def get_posts_by_page( cfg, no_posts_per_page=10 ):
def get_all_content_by_page( cfg, the_content_type, no_elements_per_page=10 ):
# the_content_type can be either posts or pages
if the_content_type not in ['posts', 'pages']:
raise Exception('Invalid content type.')
root_url = cfg['content_getter']['ghost_api']
admin_api_key = cfg['content_getter']['ghost_admin_api_key']
@@ -55,8 +63,9 @@ def get_posts_by_page( cfg, no_posts_per_page=10 ):
page_no = 1
# getting 1st page, in order to also know how many pages we have to go for...
res = requests.get( root_url + '/v2/admin/posts?include=authors,tags&formats=html,plaintext&limit=%i&page=%i' % (no_posts_per_page, page_no), headers=headers )
res = requests.get( root_url + '/v2/admin/%s?include=authors,tags&formats=html,plaintext&limit=%i&page=%i'
% (the_content_type, no_elements_per_page, page_no), headers=headers )
yield res.json()
pagination = res.json()['meta']['pagination']
@@ -65,7 +74,8 @@ def get_posts_by_page( cfg, no_posts_per_page=10 ):
page_no += 1
while page_no <= pages :
res = requests.get( root_url + '/v2/admin/posts?include=authors,tags&formats=html,plaintext&limit=%i&page=%i' % (no_posts_per_page, page_no), headers=headers )
res = requests.get( root_url + '/v2/admin/%s?include=authors,tags&formats=html,plaintext&limit=%i&page=%i'
% (the_content_type, no_elements_per_page, page_no), headers=headers )
#print(page_no, res.json()['meta'])
page_no += 1
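Because get_all_content_by_page yields one Ghost API response per page, callers can stream through arbitrarily many posts or pages without buffering everything. A usage sketch; the printed fields assume Ghost's usual response shape:

for page in get_all_content_by_page(cfg, 'pages', no_elements_per_page=25):
    for item in page.get('pages', []):
        print(item['id'], item.get('title'))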
......
@@ -12,6 +12,20 @@ def process_posts( posts ):
return output_posts
def process_pages( pages ):
output_pages = []
for page in pages:
output_page = page.copy()
output_pages.append(output_page)
# print output
return output_pages
if __name__ == '__main__':
......
from elasticsearch import Elasticsearch
def index_cleaner( cfg, post_id=None ):
def index_cleaner( cfg, content_type, content_id=None ):
source_es = Elasticsearch([cfg['reindexer']['source_url']])
destin_es = Elasticsearch([cfg['reindexer']['destination_url']])
source_index = cfg['reindexer']['source_index'][content_type]
destin_index = cfg['reindexer']['destination_index'][content_type]
# Cleaning up the ingest and digest indexes...
if post_id == None:
if content_id is None:
print('Deleting the ingest and digest indexes...')
successful = True
try:
res = source_es.indices.delete(cfg['reindexer']['source_index'])
res = source_es.indices.delete(source_index)
except Exception as e:
print(e)
successful = False
try:
rep = destin_es.indices.delete(cfg['reindexer']['destination_index'])
rep = destin_es.indices.delete(destin_index)
#print(rep)
except Exception as e:
print(e)
@@ -27,21 +30,19 @@ def index_cleaner( cfg, post_id=None ):
else:
print('Trying to delete post with id = %s...' % post_id )
source_es = Elasticsearch([cfg['reindexer']['source_url']])
destin_es = Elasticsearch([cfg['reindexer']['destination_url']])
print('Trying to delete content with id = %s...' % content_id )
successful = True
try:
res = source_es.delete(index=cfg['reindexer']['source_index'], doc_type='_doc', id=post_id)
res = source_es.delete(index=source_index, doc_type='_doc', id=content_id)
except Exception as e:
print(e)
successful = False
pass
try:
res = source_es.delete(index=cfg['reindexer']['destination_index'], doc_type='_doc', id=post_id)
res = destin_es.delete(index=destin_index, doc_type='_doc', id=content_id)
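In short, index_cleaner has two modes: with no content_id it drops the whole ingest and digest indexes for the given content type, otherwise it deletes a single document from both. Illustrative calls (the id is made up):

index_cleaner(cfg, 'posts')                         # wipe both posts indexes
index_cleaner(cfg, 'pages', content_id='5c9a1b2e')  # remove one page from both indexes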