Commit 77f6cb6c authored by ddamiron

update reindexer with publish logs to mongo

update sampler with publish logs to mongo
parent 77ace0ac
import time
import json
import msgpack
import pika
import os, sys
from elasticsearch import Elasticsearch, NotFoundError
fileDir = os.path.dirname(os.path.abspath(__file__))
parentDir = os.path.dirname(fileDir)
newPath = os.path.join(parentDir)
sys.path.append(newPath)
from lib.my_logging import logging
from lib.exit_gracefully import exit_gracefully
from lib.locker import unlock
from lib.rabbit_session import RabbitSession
from lib.log_message import LogMessage
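# This worker consumes reindexation tasks from RabbitMQ: for each dataset uuid it waits until
# the source Elasticsearch index holds the expected number of documents, removes any stale
# documents from the destination index, triggers an asynchronous reindex task, and publishes
# progress logs through the RabbitSession.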
class NotEmptyQueueException(Exception):
pass
class Reindexer:
def __init__(self, cfg):
self.cfg = cfg
self.rabbit = None
def create_sampling_task(self, cfg, channel, uuid):
        # build a task for the sample generator (full -> meta)
msg = dict()
msg['header'] = dict()
msg['header']['cfg'] = cfg
#msg['header']['reindex_task_url'] = reindex_task_url
msg['body'] = uuid
the_task_body = msgpack.packb(msg, use_bin_type=True)
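        # route the task to the sample generator's queue (entry 6 of the rabbitmq configuration)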
exchange = cfg['rabbitmq']['exchange']
queue_name = cfg['rabbitmq']['queue_name_6']
routing_key = cfg['rabbitmq']['routing_key_6']
# ------------------------send task-----------------------------------
self.rabbit.publish_task(the_body=the_task_body,
exchange=exchange,
routing_key=routing_key,
queue_name=queue_name)
# ---------------------------------------------------------------------
return
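    # Each incoming message is a msgpack map carrying the pipeline configuration under
    # ['header']['cfg'], the expected document count under ['header']['count'] and the
    # dataset uuid under ['body'].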
def on_msg_callback(self, channel, method, properties, body):
decoded_body = msgpack.unpackb(body, raw=False)
# ---------------------- send log ----------------------------
log_message = LogMessage(session_id=decoded_body['header']['cfg']['session']['id'],
# session_id=cfg['session']['id'],
uuid=decoded_body['header']['cfg']['session']['current_uuid'],
step='reindexer',
status='Starting...',
uuid_prefix='meta',
info='no info'
)
self.rabbit.publish_log(log_message=log_message.__dict__)
# ------------------------------------------------------------
cfg = decoded_body['header']['cfg']
uuid = decoded_body['body']
count_ref = decoded_body['header']['count']
if 'source_url' in cfg['reindexer'].keys():
es_source = Elasticsearch([cfg['reindexer']['source_url']], timeout=60)
else:
es_source = Elasticsearch([cfg['reindexer']['destination_url']], timeout=60)
the_query = dict()
the_query['query'] = dict()
the_query['query']['term'] = {'uuid.keyword': '{0}'.format(uuid)}
es_source.indices.refresh(index=cfg['reindexer']['source_index'])
count_es = es_source.count(cfg['reindexer']['source_index'], body=the_query).get('count')
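        # only process the task once all the documents announced by the producer are searchable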
if count_es != count_ref:
logging.warning('Documents are still being pushed to the source index for dataset with uuid = %s' % uuid)
logging.debug('count_es = %i; count_ref = %i' % (count_es, count_ref))
time.sleep(5)
channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)
return
# 1. remove already existing docs from destination index
logging.info("Removing dataset with uuid = %s from the destination index..." % uuid)
es = Elasticsearch([cfg['reindexer']['destination_url']], timeout=60)
index = cfg['reindexer']['destination_index']
try:
es.indices.refresh(index=index)
except NotFoundError:
            # the destination index may not exist yet
pass
the_query = dict()
the_query['query'] = dict()
the_query['query']['term'] = {'uuid.keyword': '{0}'.format(uuid)}
try:
res = es.delete_by_query(index, doc_type='_doc', body=the_query)
logging.debug(res)
res = es.indices.refresh(index=index)
logging.debug(res)
except NotFoundError:
pass
except Exception as e:
logging.error(e)
channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)
return
        # 2. trigger the reindexing
body = {
"source": {
"index": cfg['reindexer']['source_index'],
"query": {
"term": {"uuid.keyword": '{0}'.format(uuid)}
},
"type": "_doc",
"size": 1000
},
"dest": {
"index": cfg['reindexer']['destination_index'],
"type": "_doc"
}
}
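        # when the source documents live on another cluster, point the reindex at the remote host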
if 'source_url' in cfg['reindexer'].keys():
body['source']['remote'] = {'host': cfg['reindexer']['source_url']}
rep = es.reindex(body, wait_for_completion=False)
logging.debug(rep)
if 'task' in rep:
channel.basic_ack(delivery_tag=method.delivery_tag)
#print("")
reindex_task_url = "{0}/_tasks/{1}".format(cfg['reindexer']['destination_url'], rep['task'])
logging.info("Created reindex task: {0}".format(reindex_task_url))
# 3. create sampling task (full -> meta)
if '.full' in uuid:
self.create_sampling_task(cfg, channel, uuid)#, reindex_task_url)
logging.info("Created sampling task.")
# otherwise, remove the lock
else:
logging.info("Removing lock for dataset with uuid = %s." % uuid.replace('.meta', ''))
unlock( cfg['session']['working_directory'], uuid.replace('.meta', ''))
else:
channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)
#print("")
logging.error(json.dumps(rep, indent=4))
logging.error("Failed")
# ---------------------- send log ----------------------------
log_message = LogMessage(session_id=decoded_body['header']['cfg']['session']['id'],
# session_id=cfg['session']['id'],
uuid=decoded_body['header']['cfg']['session']['current_uuid'],
step='reindexer',
status='Terminated',
uuid_prefix='meta',
info='no info'
)
self.rabbit.publish_log(log_message=log_message.__dict__)
# ------------------------------------------------------------
return
    def main(self):
        with RabbitSession(self.cfg) as self.rabbit:
            # consume reindexation tasks from the configured queue
            reindex_tasks_qn = self.cfg['rabbitmq_queue']
            self.rabbit.consume_queue_and_launch_specific_method(specific_method=self.on_msg_callback,
                                                                 specific_queue=reindex_tasks_qn)
        return
if __name__ == '__main__':
import yaml
import time
import signal
import argparse
signal.signal(signal.SIGINT, exit_gracefully)
parser = argparse.ArgumentParser(description='Incremental reindexer')
parser.add_argument('--host', dest='host', help='the RabbitMQ host', type=str, required=True)
parser.add_argument('--port', dest='port', help='the RabbitMQ port', type=int, default=5672)
parser.add_argument('--exchange', dest='exchange', help='the RabbitMQ exchange', type=str, required=True)
parser.add_argument('--queue', dest='queue', help='the RabbitMQ queue', type=str, required=True)
parser.add_argument('--loglevel', dest='loglevel', help='the log level', default="INFO", type=str, choices=['INFO', 'DEBUG', 'WARN', 'CRITICAL', 'ERROR'])
args = parser.parse_args()
cfg = dict()
cfg['rabbitmq'] = dict()
cfg['rabbitmq_host'] = args.host
cfg['rabbitmq_port'] = args.port
cfg['rabbitmq_exchange'] = args.exchange
cfg['rabbitmq_queue'] = args.queue
cfg['rabbitmq']['user'] = 'admin'
cfg['rabbitmq']['password'] = 'admin'
cfg['rabbitmq']['queue_logs_name'] = 'session_logs'
cfg['rabbitmq']['routing_key_logs'] = 'scripts_log_key'
cfg['rabbitmq']['exchange_logs_name'] = 'download_data_grandlyon_com_logs'
logging.getLogger().setLevel(args.loglevel)
logging.info('Starting...')
while True:
try:
Reindexer(cfg).main()
except pika.exceptions.ChannelClosed:
logging.info("Waiting for tasks...")
time.sleep(5)
except pika.exceptions.AMQPConnectionError:
logging.info('Waiting for RabbitMQ to be reachable...')
time.sleep(5)
except Exception as e:
logging.error(e)
time.sleep(5)
exit(1)
import pika
import msgpack
import requests
import json
import time
import os, sys
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import AuthorizationException
fileDir = os.path.dirname(os.path.abspath(__file__))
parentDir = os.path.dirname(fileDir)
newPath = os.path.join(parentDir)
sys.path.append(newPath)
from lib.exit_gracefully import exit_gracefully
from lib.my_logging import logging
from lib.locker import unlock
from lib.rabbit_session import RabbitSession
from lib.log_message import LogMessage
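# This worker consumes sampling tasks (full -> meta): it reads a few documents of a dataset
# from the source index, flags them as samples, rewrites their uuid from '.full' to '.meta',
# bulk-indexes them into the destination index, releases the dataset lock and publishes
# progress logs through the RabbitSession.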
class Sampler:
def __init__(self, cfg):
self.cfg = cfg
self.rabbit = None
def callback(self, channel, method, properties, body):
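        # number of records kept as the published sample of each dataset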
sample_size = 10
decoded_body = msgpack.unpackb(body, raw=False)
# ---------------------- send log ----------------------------
log_message = LogMessage(session_id=decoded_body['header']['cfg']['session']['id'],
# session_id=cfg['session']['id'],
uuid=decoded_body['header']['cfg']['session']['current_uuid'],
step='sampler',
status='Starting...',
uuid_prefix='meta',
info='no info'
)
self.rabbit.publish_log(log_message=log_message.__dict__)
# ------------------------------------------------------------
cfg = decoded_body['header']['cfg']
#reindex_task_url = decoded_body['header']['reindex_task_url']
uuid = decoded_body['body']
# get sample records from the ingest index
source_es = Elasticsearch([cfg['reindexer']['source_url']], timeout=60)
the_query = dict()
the_query['size'] = sample_size
the_query['query'] = dict()
the_query['query']['term'] = {'uuid.keyword': uuid}
res = source_es.search(cfg['reindexer']['source_index'], '_doc', the_query)
docs_to_index = [ doc['_source'] for doc in res['hits']['hits'] ]
if len(docs_to_index) == 0:
logging.error('Zero documents found for dataset with uuid = %s: sleeping for 5 seconds...' % (uuid))
time.sleep(5)
channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)
return
# delete the already existing samples
destin_es = Elasticsearch([cfg['reindexer']['destination_url']], timeout=60)
destin_es.indices.refresh(index=cfg['reindexer']['destination_index'])
        the_query = dict()
        the_query['query'] = dict()
        # match both conditions; assigning the same 'term' key twice would silently drop the first one
        the_query['query']['bool'] = {'must': [{'term': {'editorial-metadata.isSample': True}},
                                               {'term': {'uuid.keyword': uuid.replace('.full', '.meta')}}]}
logging.info("Deleting already existing samples for dataset with slug = %s" % docs_to_index[0]['slug'])
try:
res = destin_es.delete_by_query(cfg['reindexer']['destination_index'], doc_type='_doc', body=the_query)
logging.debug(res)
res = destin_es.indices.refresh(index=cfg['reindexer']['destination_index'])
logging.debug(res)
        except AuthorizationException:
            # not (yet) authorized to delete from the destination index: requeue and retry later
time.sleep(5)
channel.basic_nack(delivery_tag=method.delivery_tag, requeue=1)
return
except Exception as e:
logging.error("Exception:")
logging.error(e)
logging.error("Exiting.")
exit(1)
t1 = time.time()
# push sample records to the destination index
es_body = ''
header = { "index" : { "_index" : cfg['reindexer']['destination_index'], "_type" : "_doc" } }
for doc in docs_to_index:
doc['editorial-metadata']['isSample'] = True
doc['uuid'] = uuid.replace('.full', '.meta')
es_body += '{0}\n{1}\n'.format(json.dumps(header), json.dumps(doc))
logging.info("Pushing {0} samples to Elasticsearch for dataset {1}...".format(len(docs_to_index), docs_to_index[0]['slug']))
rep = destin_es.bulk(body=es_body)
t2 = time.time()
if rep['errors'] == False:
channel.basic_ack(delivery_tag = method.delivery_tag)
logging.info("Done in %s seconds." % (t2-t1))
destin_es.indices.refresh(index=cfg['reindexer']['destination_index'])
logging.info("Removing lock for dataset with uuid = %s." % uuid.replace('.full', ''))
unlock( cfg['session']['working_directory'], uuid.replace('.full', '') )
else:
channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)
logging.error(json.dumps(rep, indent=4))
logging.error("Failed")
# else:
#
# time.sleep(5)
# channel.basic_nack(delivery_tag = method.delivery_tag, requeue=1)
# ---------------------- send log ----------------------------
log_message = LogMessage(session_id=decoded_body['header']['cfg']['session']['id'],
# session_id=cfg['session']['id'],
uuid=decoded_body['header']['cfg']['session']['current_uuid'],
step='sampler',
status='Terminated',
uuid_prefix='meta',
info='no info'
)
self.rabbit.publish_log(log_message=log_message.__dict__)
# ------------------------------------------------------------
return
    def main(self):
        with RabbitSession(self.cfg) as self.rabbit:
            # consume sampling tasks from the configured queue
            sampling_tasks_qn = self.cfg['rabbitmq_queue']
            self.rabbit.consume_queue_and_launch_specific_method(specific_method=self.callback,
                                                                 specific_queue=sampling_tasks_qn)
        return
if __name__ == "__main__":
import yaml
import time
import signal
import argparse
signal.signal(signal.SIGINT, exit_gracefully)
parser = argparse.ArgumentParser(description='Sample generator')
parser.add_argument('--host', dest='host', help='the RabbitMQ host', type=str, required=True)
parser.add_argument('--port', dest='port', help='the RabbitMQ port', type=int, default=5672)
parser.add_argument('--exchange', dest='exchange', help='the RabbitMQ exchange', type=str, required=True)
parser.add_argument('--queue', dest='queue', help='the RabbitMQ queue', type=str, required=True)
parser.add_argument('--loglevel', dest='loglevel', help='the log level', default="INFO", type=str, choices=['INFO', 'DEBUG', 'WARN', 'CRITICAL', 'ERROR'])
args = parser.parse_args()
cfg = dict()
cfg['rabbitmq'] = dict()
cfg['rabbitmq_host'] = args.host
cfg['rabbitmq_port'] = args.port
cfg['rabbitmq_exchange'] = args.exchange
cfg['rabbitmq_queue'] = args.queue
cfg['rabbitmq']['user'] = 'admin'
cfg['rabbitmq']['password'] = 'admin'
cfg['rabbitmq']['queue_logs_name'] = 'session_logs'
cfg['rabbitmq']['routing_key_logs'] = 'scripts_log_key'
cfg['rabbitmq']['exchange_logs_name'] = 'download_data_grandlyon_com_logs'
logging.getLogger().setLevel(args.loglevel)
logging.info('Starting...')
while True:
try:
Sampler(cfg).main()
except pika.exceptions.ChannelClosed:
logging.info("Waiting for tasks...")
time.sleep(5)
except pika.exceptions.AMQPConnectionError:
logging.info('Waiting for RabbitMQ to be reachable...')
time.sleep(5)
except Exception as e:
logging.error(e)
time.sleep(5)
exit(1)
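
Neither script writes to MongoDB directly: both only publish LogMessage dicts to the RabbitMQ logs exchange ('download_data_grandlyon_com_logs', queue 'session_logs'), and the commit message suggests a downstream consumer persists them to Mongo. A minimal sketch of such a consumer, assuming pymongo is available, MongoDB runs locally, the logs exchange is a direct exchange, and the log messages are JSON-encoded (none of this is part of the commit):

import json
import pika
from pymongo import MongoClient

def persist_session_logs(rabbitmq_host='localhost', mongo_uri='mongodb://localhost:27017'):
    # hypothetical consumer: the names below mirror the values hard-coded in the two scripts
    collection = MongoClient(mongo_uri)['logs']['session_logs']
    connection = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
    channel = connection.channel()
    channel.exchange_declare(exchange='download_data_grandlyon_com_logs', exchange_type='direct')
    channel.queue_declare(queue='session_logs', durable=True)
    channel.queue_bind(exchange='download_data_grandlyon_com_logs', queue='session_logs',
                       routing_key='scripts_log_key')

    def on_message(ch, method, properties, body):
        # assumes RabbitSession serialises the LogMessage dict as JSON; adapt if it uses msgpack
        collection.insert_one(json.loads(body))
        ch.basic_ack(delivery_tag=method.delivery_tag)

    channel.basic_consume(queue='session_logs', on_message_callback=on_message)
    channel.start_consuming()

if __name__ == '__main__':
    persist_session_logs()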