Commit ef306763 authored by Alessandro CERIONI
Browse files

Selective (re)indexing.

parent e7ac9456
......@@ -10,9 +10,10 @@ api = Flask(__name__, static_url_path='')
def hello():
return render_template('index.html')
@api.route("/index/posts")
def index_posts():
task_url = main()
@api.route("/index/posts", defaults={'the_post_id': None})
@api.route("/index/posts/<string:the_post_id>")
def index_posts(the_post_id):
    """Run the indexing pipeline and report the resulting reindex task URL.

    Registered on two routes: ``/index/posts`` (``the_post_id`` defaults to
    ``None``) and ``/index/posts/<the_post_id>``. The id is forwarded
    unchanged to ``main``.

    Returns:
        A JSON response of the form ``{"reindex_task_url": <value>}``, where
        the value is whatever ``main`` returned.
    """
    reindex_task_url = main(the_post_id)
    payload = {'reindex_task_url': reindex_task_url}
    return jsonify(payload)
......
......@@ -18,6 +18,23 @@ def get_token( root_url, username, password ):
return token
def get_post( cfg, the_post_id ):
    """Fetch a single post from the WordPress REST API.

    Args:
        cfg: Configuration mapping. Reads ``cfg['content_getter']`` keys
            ``wp_api`` (API root URL), ``wp_username`` and ``wp_password``.
        the_post_id: Identifier of the post to fetch. It is converted to a
            string and percent-encoded before being placed in the URL.

    Returns:
        The decoded JSON body of the response.

    Raises:
        KeyError: If a required configuration key is missing.
        requests.HTTPError: If the API answers with a 4xx/5xx status, so an
            error payload is never handed back as if it were a post.
    """
    # Imported locally so the function does not rely on the module's import
    # block already providing urllib.
    from urllib.parse import quote

    wp_cfg = cfg['content_getter']
    root_url = wp_cfg['wp_api']
    username = wp_cfg['wp_username']
    password = wp_cfg['wp_password']

    token = get_token(root_url, username, password)
    headers = {'Authorization': 'Bearer %s' % token}

    # Percent-encode the id (including '/') so that a value containing
    # '/', '?', '#' or '&' cannot change the request path or query string.
    safe_post_id = quote(str(the_post_id), safe='')

    # `_embed` asks WordPress to inline linked resources in the response.
    post_url = root_url + '/wp/v2/posts/%s?_embed' % safe_post_id

    res = requests.get(post_url, headers=headers)

    # Fail loudly on HTTP errors instead of returning the error document.
    res.raise_for_status()

    return res.json()
def get_posts_by_page( cfg, no_posts_per_page=10 ):
root_url = cfg['content_getter']['wp_api']
......
......@@ -19,7 +19,35 @@ def index_docs( cfg, docs ):
},
"mappings": {
"_doc": {
"enabled": False
"dynamic_templates": [ # priority is given by order!
{
"uuid" : {
"path_match": "uuid",
# "mapping": {
# "type": "keyword",
# }
"mapping": {
"type": "text",
#"ignore_malformed": True
"index": False,
"fields": {
"keyword":
{
"type": "keyword"
}
}
}
}
},
{
"default" : {
"path_match": "*",
"mapping": {
"enabled": "false"
}
}
}
]
}
}
}
......@@ -33,7 +61,7 @@ def index_docs( cfg, docs ):
try:
# create index, in case it doesn't exist yet
rep = es.indices.create(es_index, es_body)
except:
except Exception as e:
pass
......@@ -51,8 +79,8 @@ def index_docs( cfg, docs ):
es_body = ''
for doc in docs:
try:
header['index']['_id'] = doc['id'] #hashlib.md5( json.dumps(doc, sort_keys=True).encode("utf-8") ).hexdigest()
del doc['id']
header['index']['_id'] = doc['uuid'] #hashlib.md5( json.dumps(doc, sort_keys=True).encode("utf-8") ).hexdigest()
#del doc['uuid']
except:
header['index']['_id'] = hashlib.md5( json.dumps(doc, sort_keys=True).encode("utf-8") ).hexdigest()
#print(header)
......
......@@ -14,25 +14,26 @@ def reindexer(cfg):
try:
rep = destination_es.indices.delete_template(cfg['reindexer']['template'])
#print(rep)
except:
except Exception as e:
pass
rep = destination_es.indices.put_template(cfg['reindexer']['template'], template)
# rep = es.indices.get_template("template_1")
# print(rep)
# t1 = time.time()
try:
rep = destination_es.indices.delete(cfg['reindexer']['destination_index'])
#print(rep)
except:
pass
# try:
# rep = destination_es.indices.delete(cfg['reindexer']['destination_index'])
# #print(rep)
# except:
# pass
try:
rep = destination_es.indices.create(cfg['reindexer']['destination_index'])
#print(rep)
except:
except Exception as e:
pass
body = {
......
from elasticsearch import Elasticsearch
from lib.content_getter import get_posts_by_page
from lib.content_getter import get_posts_by_page, get_post
from lib.content_processor import process_posts
from lib.indexer import index_docs
from lib.reindexer import reindexer
from lib.alias_setter import alias_setter
from lib.index_cleaner import index_cleaner
def main():
def main( post_id=None ):
import yaml
......@@ -16,33 +17,39 @@ def main():
print('Starting...')
index_cleaner(cfg, post_id)
print('Deleting source index...')
es = Elasticsearch([cfg['reindexer']['source_url']])
if post_id == None:
try:
res = es.indices.delete(cfg['reindexer']['source_index'])
except:
pass
print('Getting posts...')
pages = get_posts_by_page(cfg)
cnt = 1
for page in pages:
#print(page)
print('Getting posts...')
pages = get_posts_by_page(cfg)
cnt = 1
for page in pages:
#print(page)
print('Processing page no. %i...' % cnt)
processed_page = process_posts(page)
print('Processing page no. %i...' % cnt)
processed_page = process_posts(page)
successful = index_docs(cfg, processed_page)
successful = index_docs(cfg, processed_page)
if not successful:
print('Something went wrong. Exiting...')
exit(1)
#print(processed_page)
cnt += 1
else:
print('Getting post with uuid=%s...' % post_id)
post = get_post(cfg, post_id)
processed_post = process_posts([post])
successful = index_docs(cfg, processed_post)
if not successful:
print('Something went wrong. Exiting...')
exit(1)
#print(processed_page)
cnt += 1
print('Reindexing...')
task_url = reindexer(cfg)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment