Create a percolator with elasticsearch-dsl-py

I have a project for a master's seminar that consists of taking a list of 107605 article records, indexing their information into a percolator-type index, and finally submitting texts through an interface, percolating them, and highlighting the matching words.

These are the steps we follow from the console:

  • We create an index with a percolator mapping:

     curl -XPUT 'localhost:9200/my-index?pretty' -H 'Content-Type: application/json' -d'
    {
       "mappings": {
           "_doc": {
               "properties": {
                   "title": {
                       "type": "text"
                   },
                   "query": {
                       "type": "percolator"
                   }
               }
           }
       }
    }
    '
    
  • We index a record:

     curl -XPUT 'localhost:9200/my-index/_doc/1?refresh&pretty' -H 'Content-Type: application/json' -d'
     {
        "CourseId": 35,
        "UnitId": 12390,
        "id": "16069",
        "CourseName": "ARK102U_ARKEOLOJİK ALAN YÖNETİMİ",
        "FieldId": 8,
        "field": "TARİH",
        "query": {
            "span_near" : {
                "clauses" : [
                    { "span_term" : { "title" : "dünya" } },
                    { "span_term" : { "title" : "mirası" } },
                    { "span_term" : { "title" : "sözleşmesi" } }
                ],
                "slop" : 0,
                "in_order" : true
            }
        }
    }
    '
    

    As we can see, the query stored with the record is built from the words of its title field.

  • We percolate a text against the stored queries:

    curl -XGET 'localhost:9200/my-index/_search?pretty' -H 'Content-Type: application/json' -d'
    {
        "query" : {
            "percolate" : {
                "field" : "query",
                "document" : {
                    "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
                }
            }
        },
    
        "highlight": {
          "fields": {
            "title": {}
          }
        }
    }
    '
    

    The records come in a JSON file; so far I read them and load them into a dictionary, but from there on I do not know how to continue. This is my approach:

    import json
    from elasticsearch_dsl import (
        DocType,
        Integer,
        Percolator,
        Text,
    )
    
    # Read the json File
    json_data = open('titles.json').read()
    data = json.loads(json_data)
    
    docs = data['response']['docs']
    
    # Creating an Elasticsearch connection
    # connections.create_connection(hosts=['localhost'], port=9200, timeout=20)
    """
    curl -XPUT 'localhost:9200/my-index?pretty' -H 'Content-Type: application/json' -d'
    {
        "mappings": {
            "_doc": {
                "properties": {
                    "title": {
                        "type": "text"
                    },
                    "query": {
                        "type": "percolator"
                    }
                }
            }
        }
    }
    '
    
    """
    
    class Document(DocType):
        course_id = Integer()
        unit_id = Integer()
        # title = Text()
        id = Integer()
        course_name = Text()
        field_id = Integer()
        field = Text()
    
    
        title = Text()
        query = Percolator()

        class Meta:
            index = 'titles_index'
    
    """
        "query": {
            "span_near" : {
                "clauses" : [
                    { "span_term" : { "title" : "dünya" } },
                    { "span_term" : { "title" : "mirası" } },
                    { "span_term" : { "title" : "sözleşmesi" } }
                ],
                "slop" : 0,
                "in_order" : true
            }
        }
    
    """
    
    for doc in docs:

        terms = doc['title'].split(" ")
        course_id = doc['CourseId']
        unit_id = doc['UnitId']
        id = doc['id']
        course_name = doc['CourseName']
        field_id = doc['FieldId']
        field = doc['field']
    

    How should I continue the development?

    Thank you very much.

        
    asked by SalahAdDin on 21.03.2018 at 10:19

    1 answer


    Very well, I am going to post here the answer that I put together with the help of another user on the English version of this site:

    First, I read all the records from the JSON file:

    import json
    
    # Read the json File
    json_data = open('titles.json').read()
    data = json.loads(json_data)
    
    docs = data['response']['docs']
    

    All the records are now in the docs list.
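For context, titles.json is assumed here to use a Solr-style response wrapper around records with the fields shown in the question (the title value below is only an illustration, not taken from the actual file):

```json
{
    "response": {
        "docs": [
            {
                "CourseId": 35,
                "UnitId": 12390,
                "id": "16069",
                "CourseName": "ARK102U_ARKEOLOJİK ALAN YÖNETİMİ",
                "FieldId": 8,
                "field": "TARİH",
                "title": "dünya mirası sözleşmesi"
            }
        ]
    }
}
```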

    Now we are going to define a class that we will call Document , taking advantage of the persistence features provided by elasticsearch-dsl-py :

    from elasticsearch_dsl import (
        connections,
        DocType,
        Percolator,
        Text
    )
    
    class Document(DocType):
        title = Text()
        query = Percolator()    # query is a percolator
    
        class Meta:
            index = 'title-index' # index name
            doc_type = '_doc'
    
        def save(self, **kwargs):
            return super(Document, self).save(**kwargs)
    

    Our document will have two fields: title , of type text, and query , of type percolator; the index will be called title-index and the document type will be _doc .

    Do not forget to configure a default connection, which the rest of the code will use:

    # creating a new default elasticsearch connection
    connections.configure(
        default={'hosts': 'localhost:9200'},
    )
    

    Now let's initialize the index:

    # create the mappings in elasticsearch
    Document.init()
    

    Finally, we index each record so that the words that make up its title become the query associated with that element:

    # index one percolator query per record
    for doc in docs:
        # split the title into individual terms
        terms = doc['title'].split(" ")
        # build the list of span clauses
        clauses = []
        for term in terms:
            # each word of the title becomes a SpanTerm
            field = SpanTerm(title=term)
            # add each SpanTerm to the clauses
            clauses.append(field)
        # the stored query is a SpanNear over those clauses
        query = SpanNear(clauses=clauses, slop=0, in_order=True)
        # create a new Document item with the SpanNear query
        item = Document(query=query)
        # save the item to the index
        item.save()
    

    Here we used the query classes defined by the library ( SpanTerm and SpanNear , from elasticsearch_dsl.query ) to avoid building the queries out of unwieldy nested dictionaries.

    The resulting code is:

    import json
    
    from elasticsearch_dsl import (
        connections,
        DocType,
        Percolator,
        Text
    )
    from elasticsearch_dsl.query import (
        SpanNear,
        SpanTerm
    )
    
    # Read the json File
    json_data = open('titles.json').read()
    data = json.loads(json_data)
    
    docs = data['response']['docs']
    
    
    # creating a new default elasticsearch connection
    connections.configure(
        default={'hosts': 'localhost:9200'},
    )
    
    
    class Document(DocType):
        title = Text()
        query = Percolator()    # query is a percolator
    
        class Meta:
            index = 'title-index' # index name
            doc_type = '_doc'
    
        def save(self, **kwargs):
            return super(Document, self).save(**kwargs)
    
    
    # create the mappings in elasticsearch
    Document.init()
    
    # index one percolator query per record
    for doc in docs:
        # split the title into individual terms
        terms = doc['title'].split(" ")
        # build the list of span clauses
        clauses = []
        for term in terms:
            # each word of the title becomes a SpanTerm
            field = SpanTerm(title=term)
            # add each SpanTerm to the clauses
            clauses.append(field)
        # the stored query is a SpanNear over those clauses
        query = SpanNear(clauses=clauses, slop=0, in_order=True)
        # create a new Document item with the SpanNear query
        item = Document(query=query)
        # save the item to the index
        item.save()
    
        
    answered on 28.03.2018 at 15:29