Commit 526116f5 authored by Alessandro CERIONI
Browse files

Initial commit.

parents
# Image for the Flask indexing API, served by gunicorn on port 8000.
# NOTE(review): `python:slim` is a floating tag; pin it to the Python version
# the project is tested against (e.g. python:<major.minor>-slim) for
# reproducible builds.
FROM python:slim

WORKDIR /app

# Install dependencies before copying the sources, so this layer is reused
# from the build cache as long as requirements.txt is unchanged.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY lib/*.py ./lib/
COPY templates/*.html ./templates/
COPY api.py main.py ./

# Documentation only: the port gunicorn binds to below.
EXPOSE 8000

# Exec form: gunicorn runs as PID 1 and receives the stop signal directly
# instead of being wrapped in `/bin/sh -c`.
CMD ["gunicorn", "--workers=2", "-b", "0.0.0.0:8000", "api:api"]
from flask import Flask
from flask import jsonify, render_template
import time
from main import main
# WSGI application object; referenced as `api:api` by the gunicorn command.
# NOTE(review): static_url_path='' presumably serves static files from the URL root - confirm.
api = Flask(__name__, static_url_path='')
@api.route("/")
def hello():
    """Serve the landing page rendered from the 'index.html' template."""
    landing_page = render_template('index.html')
    return landing_page
@api.route("/index/posts")
def index_posts():
    """Run main() and return its result as JSON.

    The response body is a JSON object with a single key,
    'reindex_task_url', holding the value returned by main().
    """
    return jsonify(reindex_task_url=main())
if __name__ == '__main__':
    # Development entry point only (the Docker image starts the app through
    # gunicorn instead). The interactive debugger allows arbitrary code
    # execution, so it must not be enabled unconditionally while listening on
    # all interfaces: it is now opt-in through FLASK_DEBUG=1 (or "true").
    import os

    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    api.run(host='0.0.0.0', port=8000, debug=debug_enabled)
# Configuration template: every <...> placeholder must be replaced by a real value.
# NOTE(review): the nesting of the keys is not visible in this view; the
# content getter reads the wp_* keys as cfg['content_getter'][...], so each
# group of keys below presumably belongs under the section name preceding it.

# WordPress REST API root URL and the credentials used to obtain a token.
content_getter:
wp_api: <the_wordpress_api_url>
wp_username: <the_wordpress_username>
wp_password: <the_wordpress_password>
# Elasticsearch URL and index name for the indexer.
indexer:
url: <the_elasticsearch_url>
index: <the_elasticsearch_index>
# Source and destination Elasticsearch URLs / indices for the reindexer.
reindexer:
source_url: <the_source_elasticsearch_url>
destination_url: <the_destination_elasticsearch_url>
source_index: <the_source_elasticsearch_index>
destination_index: <the_destination_elasticsearch_index>
# Name given to the Elasticsearch dynamic template.
# NOTE(review): presumably part of the reindexer section - confirm.
template: <the_name_to_use_for_the_elasticsearch_dynamic_template>
# Compose definition for the indexing API service.
version: "3.0"
services:
  api:
    # Built from the Dockerfile found in this directory.
    build: .
    ports:
      # HOST:CONTAINER. Quoted so that YAML always reads the mapping as a
      # string (unquoted xx:yy values can be parsed as base-60 numbers).
      - "8000:8000"
    volumes:
      # The configuration file is bind-mounted at run time; the image build
      # does not copy it.
      - ./config.yaml:/app/config.yaml
#import msgpack
import requests
import json
#import pika
def get_token( root_url, username, password, timeout=30 ):
    """Request an authentication token from the '/jwt-auth/v1/token' endpoint.

    Parameters:
        root_url: base URL of the REST API; a trailing '/' is tolerated.
        username: sent as the 'username' form field.
        password: sent as the 'password' form field.
        timeout: seconds to wait for the server before giving up (default 30).

    Returns:
        The value of the 'token' key of the JSON response.

    Raises:
        requests.HTTPError: when the server answers with an error status
            (e.g. rejected credentials), instead of an opaque KeyError on
            the missing 'token' key.
    """
    # Avoid a double slash when the configured root URL ends with '/'.
    url = root_url.rstrip('/') + '/jwt-auth/v1/token'
    credentials = {'username': username, 'password': password}

    # Without a timeout, requests may block forever on an unresponsive server.
    res = requests.post(url, data=credentials, timeout=timeout)
    res.raise_for_status()

    return res.json()['token']
def get_posts_by_page( cfg, no_posts_per_page=10, timeout=30 ):
    """Yield the posts of the WordPress REST API, one page at a time.

    Parameters:
        cfg: configuration mapping; reads cfg['content_getter'] keys
            'wp_api', 'wp_username' and 'wp_password'.
        no_posts_per_page: value sent as the 'per_page' query parameter.
        timeout: seconds to wait for each page request (default 30).

    Yields:
        The decoded JSON body of each page, from page 1 up to the page count
        announced by the 'X-WP-TotalPages' response header.

    Raises:
        requests.HTTPError: when a page request returns an error status,
            instead of a KeyError on the missing pagination header.
    """
    getter_cfg = cfg['content_getter']
    root_url = getter_cfg['wp_api']
    username = getter_cfg['wp_username']
    password = getter_cfg['wp_password']

    token = get_token(root_url, username, password)

    # https://developer.wordpress.org/rest-api/using-the-rest-api/pagination/
    headers = {'Authorization': 'Bearer %s' % token}

    page_no = 1
    # Placeholder so that the first request is issued; the real page count is
    # read from the response headers of every request.
    total_pages = 1

    while page_no <= total_pages:
        res = requests.get(
            root_url + '/wp/v2/posts?_embed&per_page=%s&page=%s&status=any' % (no_posts_per_page, page_no),
            headers=headers,
            timeout=timeout
        )
        # Surface HTTP failures before touching the pagination header.
        res.raise_for_status()

        total_pages = int(res.headers['X-WP-TotalPages'])
        page_no += 1

        yield res.json()
if __name__ == '__main__':
    # This module only provides helpers; running it directly does no work.
    print('Nothing to do. Exiting.')
def process_posts( posts ):
    """Flatten WordPress REST API posts into plain dictionaries.

    Parameters:
        posts: iterable of post dictionaries carrying an '_embedded' section
            with at least the 'wp:term' and 'author' entries.

    Returns:
        A list with one dictionary per input post, holding the keys 'uuid',
        'title', 'status', 'content', 'excerpt', 'categories', 'tags',
        'publicationDate', 'modificationDate', 'author' and 'slug', plus
        'featuredMedia' when the post embeds a 'wp:featuredmedia' entry.
    """
    def names_for(term_groups, taxonomy):
        # Names of every embedded term belonging to the requested taxonomy.
        return [
            term['name']
            for group in term_groups
            for term in group
            if term['taxonomy'] == taxonomy
        ]

    results = []

    for post in posts:
        embedded = post['_embedded']
        term_groups = embedded['wp:term']

        record = {
            'uuid': str(post['id']),
            'title': post['title']['rendered'],
            'status': post['status'],
            'content': post['content']['rendered'],
            'excerpt': post['excerpt']['rendered'],
            'categories': names_for(term_groups, 'category'),
            'tags': names_for(term_groups, 'post_tag'),
        }

        if 'wp:featuredmedia' in embedded:
            record['featuredMedia'] = embedded['wp:featuredmedia'][0]['source_url']

        # The *_gmt timestamps carry no zone suffix; 'Z' is appended to them.
        record['publicationDate'] = post['date_gmt'] + 'Z'
        record['modificationDate'] = post['modified_gmt'] + 'Z'
        record['author'] = embedded['author'][0]['name']
        record['slug'] = post['slug']

        results.append(record)

    return results
# Manual smoke test: when the module is run directly, process_posts() is
# applied to one hard-coded sample post. The result is only stored in
# `output`; the print at the end is commented out.
if __name__ == '__main__':
# Sample post fixture. It embeds one author and one 'category' term, has an
# empty second term group and no 'wp:featuredmedia' entry.
posts = [{
"type": "post",
"sticky": "false",
"author": 1,
"id": 21,
"_embedded": {
"author": [
{
"slug": "admin",
"url": "",
"avatar_urls": {
"24": "https://secure.gravatar.com/avatar/e3d458652ea41611151e332349814fa5?s=24&d=mm&r=g",
"96": "https://secure.gravatar.com/avatar/e3d458652ea41611151e332349814fa5?s=96&d=mm&r=g",
"48": "https://secure.gravatar.com/avatar/e3d458652ea41611151e332349814fa5?s=48&d=mm&r=g"
},
"id": 1,
"description": "",
"_links": {
"self": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/users/1"
}
],
"collection": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/users"
}
]
},
"link": "https://192.168.62.15:8000/author/admin/",
"name": "admin"
}
],
"wp:term": [
[
{
"slug": "ma-carte",
"taxonomy": "category",
"id": 3,
"_links": {
"self": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/categories/3"
}
],
"curies": [
{
"href": "https://api.w.org/{rel}",
"name": "wp",
"templated": "true"
}
],
"wp:post_type": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/posts?categories=3"
}
],
"about": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/taxonomies/category"
}
],
"collection": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/categories"
}
]
},
"link": "https://192.168.62.15:8000/category/ma-carte/",
"name": "ma carte"
}
],
[]
]
},
"modified": "2018-05-02T09:46:18",
"content": {
"protected": "false",
"rendered": ""
},
"date": "2018-04-30T11:10:56",
"meta": [],
"template": "",
"tags": [],
"date_gmt": "2018-04-30T09:10:56",
"link": "https://192.168.62.15:8000/le-deuxieme-article-est-super-sympa/",
"excerpt": {
"protected": "false",
"rendered": ""
},
"slug": "le-deuxieme-article-est-super-sympa",
"comment_status": "open",
"_links": {
"collection": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/posts"
}
],
"curies": [
{
"href": "https://api.w.org/{rel}",
"name": "wp",
"templated": "true"
}
],
"author": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/users/1",
"embeddable": "true"
}
],
"wp:term": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/categories?post=21",
"embeddable": "true",
"taxonomy": "category"
},
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/tags?post=21",
"embeddable": "true",
"taxonomy": "post_tag"
}
],
"predecessor-version": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/posts/21/revisions/26",
"id": 26
}
],
"self": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/posts/21"
}
],
"wp:attachment": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/media?parent=21"
}
],
"version-history": [
{
"count": 3,
"href": "https://192.168.62.15:8000/wp-json/wp/v2/posts/21/revisions"
}
],
"about": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/types/post"
}
],
"replies": [
{
"href": "https://192.168.62.15:8000/wp-json/wp/v2/comments?post=21",
"embeddable": "true"
}
]
},
"format": "standard",
"modified_gmt": "2018-05-02T07:46:18",
"ping_status": "open",
"categories": [
3
],
"guid": {
"rendered": "http://192.168.62.15:8000/?p=21"
},
"featured_media": 0,
"title": {
"rendered": "NOUVEAU ! toutes vos donn\u00e9es Dataly sur une carte"
},
"status": "publish"
}
]
# Convert the fixture; uncomment the print below to inspect the result.
output = process_posts(posts)
#print(output)
# -*- coding: UTF-8 -*-
template = {
#"index_patterns" : ["posts.v1"],
"order" : 0,
"settings" : {
"index.mapping.total_fields.limit": 10000,
#"index.mapping.ignore_malformed": True,
"number_of_shards" : 1,
"number_of_replicas" : 0,
"max_ngram_diff": 100,
"analysis": {
"filter": {
"my_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : False
},
"my_original_preserving_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : True
},
"french_elision": {
"type": "elision",
"articles_case": True,
"articles": [
"l", "m", "t", "qu", "n", "s",
"j", "d", "c", "jusqu", "quoiqu",
"lorsqu", "puisqu"
]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"preserving_word_delimiter": {
"type": "word_delimiter",
"preserve_original": "true"
},
"protect_keywords": {
"type": "keyword_marker",
"keywords": ["vélo'v"]
}
# "shingle": {
# "type": "shingle",
# "min_shingle_size": 2,
# "max_shingle_size": 3
# }
# "french_keywords": {
# "type": "keyword_marker",
# "keywords": ["Exemple"]
# },
# "french_stemmer": {
# "type": "stemmer",
# "language": "light_french"
# }
},
"tokenizer": {
"my_edge_ngram_tokenizer": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 30,
"token_chars": [
"letter",
"digit"
]
}
},
"analyzer": {
"edge_ngram_analyzer_with_asciifolding": {
"type": "custom",
"tokenizer": "my_edge_ngram_tokenizer",
"filter": [
"lowercase",
"protect_keywords",
"my_original_preserving_ascii_folding",
"french_elision",
"french_stop"#,
#"preserving_word_delimiter"
]
},
"my_search_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"protect_keywords",
# "my_ascii_folding",
"french_elision",
"french_stop"#,
#"preserving_word_delimiter"
]
},
"my_suggest_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"protect_keywords",
# "my_ascii_folding",
"french_elision",
"french_stop"#,
#"preserving_word_delimiter"
]
},
# "trigram": {
# "type": "custom",
# "tokenizer": "standard",
# "filter": ["standard", "shingle"]
# },
# "reverse": {
# "type": "custom",
# "tokenizer": "standard",
# "filter": ["standard", "reverse"]
# }
}
}
},
"mappings" : {
"_doc" : {
"dynamic_templates": [ # priority is given by order!
{
"geoshape-template" : {
"match_pattern": "regex",
"path_match": "metadata-fr.bbox|data-fr.geometry",
"mapping": {
"type": "geo_shape",
"tree": "quadtree",
#"index": "false"
"ignore_malformed": True
}
}
},
{
"link-template" : {
"path_match": "metadata-fr.link",
"mapping": {
#"type": "nested",
"index": "false"
#"ignore_malformed": True
}
}
},
# {
# "exception1-template" : {
# "path_match": "data-fr.properties.date_photo",
# "mapping": {
# "type": "text"
# # "index": False,
# #"ignore_malformed": True
# }
# }
# },
{
"date-template" : {
"match_mapping_type": "date",
# "path_match": "data-fr\.properties\.date.+|data-fr\.properties\.last_update.*|metadata-fr\.creationDate",
"mapping": {
"type": "date",
"format": "strict_date_optional_time",#||epoch_millis||yyyy-MM-dd HH:mm:ss",
"fields": {
"sort": {
"type": "date"
}
}
# "index": False,
#"ignore_malformed": True
}
}
},
# {
# "unindexed-field-template": {
# "match_pattern": "regex",
# "match": "url|href",
# "mapping": {
# # "type": "keyword",
# "index": False,
# "ignore_malformed": True
# }
# }
# },
{
"unindexed-path-template": {
"match_pattern": "regex",
"match_mapping_type": "*",
"path_match": "metadata-fr\.href.*|metadata-fr\.idxMsg.*|data-fr\.geometry\..*|metadata-fr\.identifier.*|metadata-fr\.geonet\:info\.@xmlns:geonet|metadata-fr\.responsibleParty\.logo|metadata-fr\.image\..*|.*url|metadata-fr\.link\.name",
# "match": "(metadata-fr\.image.*|data-fr\.geometry.*|metadata-fr\.href.*|metadata-fr\.idxMsg.*)",
"mapping": {
"type": "text",
#"ignore_malformed": True
"index": False
}
}
},
{
"long-template": {
"match_mapping_type": "long",
"mapping": {
"type": "long",
"fields": {
"sort":
{
"type": "long"
}
}
}
}
},
{
"double-template": {
"match_mapping_type": "double",
"mapping": {
"type": "double",
"fields": {
"sort":
{
"type": "double"
}
}
}
}
},
{
"boolean-template": {
"match_mapping_type": "boolean",
"mapping": {
"type": "boolean",
"fields": {
"sort":
{
"type": "boolean"
}
}
}
}
},
# {
# "exception1-template": {
# "match_pattern": "regex",
# "match": "data-fr.properties.datemajgraph|data-fr.properties.date_creation", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "date",
# "ignore_malformed": True
# }
# }
# },
# {
# "exception2-template": {
# "match_mapping_type": "long",
# # "match": "numero", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "long",v9
# "ignore_malformed": True
# }
# }
# },
# {
# "exception3-template": {
# "match_pattern": "regex",
# "match": "data-fr\.properties\.address", #test-q-plus-wfs.c1b069ca-181d-4265-9838-8d182f207bd3.ingest.v6
# "mapping": {
# "type": "object",
# "ignore_malformed": True
# }
# }
# },
# {