Without knowing your exact mapping and sample data it is difficult to tell why your search returns too many results. However, I assume that your edgengram tokenizer uses a very small minimum gram size, such as 1 or 2. With such a setting almost every query term matches something. For example, take the following phrase with a minimum gram size of 1:
a quick brown fox
It is tokenized as follows:
a q qu qui quic quick b br bro brow brown f fo fox
Such short fragments match almost any query, which can produce far too many results. As a solution you might use a larger minimum gram size and add fuzzy search to still catch similar terms.
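To see how quickly the number of tokens grows, here is a small standalone Python sketch that mimics what a front edgeNGram filter does per word (just an illustration of the principle, not Elasticsearch code):

def edge_ngrams(text, min_gram, max_gram=15):
    # Emit front edge n-grams per word, like an edgeNGram filter would.
    tokens = []
    for word in text.lower().split():
        for size in range(min_gram, min(len(word), max_gram) + 1):
            tokens.append(word[:size])
    return tokens

print(edge_ngrams("a quick brown fox", min_gram=1))  # 14 tokens, incl. 'a', 'q', 'b', 'f'
print(edge_ngrams("a quick brown fox", min_gram=3))  # 7 tokens: 'qui'..'quick', 'bro'..'brown', 'fox'

With min_gram=1 a single-letter query term already matches every word starting with that letter; raising the minimum to 3 (as in the settings below) keeps only reasonably selective tokens.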
But first, please provide your exact data mapping, sample data and your queries.
Below is an example custom backend. The crucial parts are the custom field types at the bottom of the file and the build_schema function.
Example Custom Backend Config:
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'myservice.apps.search.search_backends.CustomElasticSearchEngine',
        'URL': 'http://127.0.0.1:9200/',
        'INDEX_NAME': 'haystack_prod',
        'TIMEOUT': 60,
    },
}
Example Custom Backend:
from django.conf import settings
from haystack.backends.elasticsearch_backend import ElasticsearchSearchBackend, ElasticsearchSearchEngine
from haystack.fields import NgramField
from haystack.models import SearchResult
import requests
import pyelasticsearch

class CustomElasticBackend(ElasticsearchSearchBackend):
    #DEFAULT_ANALYZER = "snowball"
    DEFAULT_SETTINGS = {
        'settings': {
            "analysis": {
                "analyzer": {
                    "ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_ngram"]
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["lowercase", "asciifolding", "haystack_edgengram"]
                    },
                    "full_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding"]
                    },
                    "partial_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_ngrams"]
                    },
                    "partial_text_front": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_front"]
                    },
                    "partial_text_back": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_back"]
                    }
                },
                "tokenizer": {
                    "haystack_ngram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15,
                    },
                    "haystack_edgengram_tokenizer": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15,
                        "side": "front"
                    }
                },
                "filter": {
                    "haystack_ngram": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "haystack_edgengram": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "text_ngrams": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_front": {
                        "type": "edgeNGram",
                        "side": "front",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_back": {
                        "type": "edgeNGram",
                        "side": "back",
                        "min_gram": 3,
                        "max_gram": 50
                    }
                }
            }
        }
    }
    def makemapping(self, index_fieldname):
        # Index the field several ways via multi_field: as-is, as full
        # words, and as n-gram/edge-n-gram variants for partial matching.
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "partial_text",
                                  "include_in_all": True},
                "full": {"type": "string",
                         "analyzer": "full_text",
                         "include_in_all": True},
                "partial": {"type": "string",
                            "index_analyzer": "partial_text",
                            "search_analyzer": "full_text",
                            "include_in_all": True},
                "partial_front": {"type": "string",
                                  "index_analyzer": "partial_text_front",
                                  "search_analyzer": "full_text",
                                  "include_in_all": True},
                "partial_back": {"type": "string",
                                 "index_analyzer": "partial_text_back",
                                 "search_analyzer": "full_text",
                                 "include_in_all": True}
            }
        }

    def emailmapping(self, index_fieldname):
        # E-mail addresses are indexed both tokenized (standard) and as a
        # single keyword token for exact matches.
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "standard"},
                "keyword": {"type": "string",
                            "analyzer": "keyword",
                            "include_in_all": True},
            }
        }
    def makequery(self, param):
        # Strip the first and last character for the fuzzy clauses so that
        # typos at the edges of the term still match.
        fuzzy_param = param[1:-1] if len(param) > 2 else param
        query = {
            "query": {
                "bool": {
                    "should": [
                        # TODO: autocompletion does not work with fuzzy search
                        {"fuzzy_like_this": {"fields": ["text.full"], "like_text": fuzzy_param, "max_query_terms": 12}},
                        {"fuzzy": {"text": {"value": fuzzy_param, "min_similarity": 0.6}}},
                        #{"fuzzy": {"email": fuzzy_param}},
                        #{"fuzzy": {"first_name": fuzzy_param}},
                        #{"fuzzy": {"last_name": fuzzy_param}},

                        # this for the case first_name is a CharField
                        #{"match": {"first_name": {"query": param, "boost": 10}}},
                        #{"match": {"last_name": {"query": param, "boost": 10}}},

                        # email
                        {"text": {"email": {"boost": 5, "query": param}}},
                        {"text": {"email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email": {"boost": 5, "query": param}}},
                        {"text": {"contact_email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email2": {"boost": 5, "query": param}}},
                        {"text": {"contact_email2.keyword": {"boost": 10, "query": param}}},

                        # first_name
                        #{"text": {"first_name": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"first_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"first_name.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"first_name.partial_back": {"boost": 4, "query": param}}},

                        # last_name
                        #{"text": {"last_name": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"last_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"last_name.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"last_name.partial_back": {"boost": 4, "query": param}}},

                        # company
                        #{"text": {"company": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"company.partial": {"boost": 5, "query": param}}},
                        {"text": {"company.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"company.partial_back": {"boost": 4, "query": param}}},

                        # text (plain ngrams give less accurate results)
                        #{"text": {"text": {"boost": 1, "query": param, "type": "phrase"}}},
                        {"text": {"text.partial": {"boost": 3, "query": param, "type": "phrase"}}},
                        {"text": {"text.partial_front": {"boost": 5, "query": param, "type": "phrase"}}},
                        #{"text": {"text.partial_back": {"boost": 5, "query": param, "type": "phrase"}}}
                    ]
                }
            },
            "size": 100
        }
        return query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        if not self.setup_complete:
            self.setup()

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)
        search_kwargs['from'] = kwargs.get('start_offset', 0)

        order_fields = set()
        for order in search_kwargs.get('sort', []):
            for key in order.keys():
                order_fields.add(key)
        geo_sort = '_geo_distance' in order_fields

        end_offset = kwargs.get('end_offset')
        start_offset = kwargs.get('start_offset', 0)
        if end_offset is not None and end_offset > start_offset:
            search_kwargs['size'] = end_offset - start_offset

        try:
            raw_results = self.conn.search(search_kwargs,
                                           index=self.index_name,
                                           doc_type='modelresult')
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise
            self.log.error("Failed to query Elasticsearch using '%s': %s", query_string, e)
            raw_results = {}

        return self._process_results(raw_results,
                                     highlight=kwargs.get('highlight'),
                                     result_class=kwargs.get('result_class', SearchResult),
                                     distance_point=kwargs.get('distance_point'),
                                     geo_sort=geo_sort)
    def build_search_kwargs(self, query_string, **kwargs):
        # Always use the hand-written query. To fall back to the stock
        # behaviour, call the original implementation instead:
        # return super(CustomElasticBackend, self).build_search_kwargs(query_string, **kwargs)
        return self.makequery(query_string)
    def build_schema(self, fields):
        content_field_name = ''
        mapping = {}

        for field_name, field_class in fields.items():
            if field_class.field_type == 'nameword':
                field_mapping = self.makemapping(field_class.index_fieldname)
            elif field_class.field_type == 'email':
                field_mapping = self.emailmapping(field_class.index_fieldname)
            else:
                field_mapping = {
                    'boost': field_class.boost,
                    'index': 'analyzed',
                    'store': 'yes',
                    'type': 'string',
                }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            # checks succeed, call a custom method on the form that
            # returns, per-backend, the right type of storage?
            if field_class.field_type in ['date', 'datetime']:
                field_mapping['type'] = 'date'
            elif field_class.field_type == 'integer':
                field_mapping['type'] = 'long'
            elif field_class.field_type == 'float':
                field_mapping['type'] = 'float'
            elif field_class.field_type == 'boolean':
                field_mapping['type'] = 'boolean'
            elif field_class.field_type == 'ngram':
                field_mapping['analyzer'] = "ngram_analyzer"
            elif field_class.field_type == 'edge_ngram':
                field_mapping['analyzer'] = "edgengram_analyzer"
            elif field_class.field_type == 'location':
                field_mapping['type'] = 'geo_point'

            # The docs claim nothing is needed for multivalue...
            # if field_class.is_multivalued:
            #     field_data['multi_valued'] = 'true'

            if field_class.stored is False:
                field_mapping['store'] = 'no'

            # Do this last to override `text` fields.
            if field_class.indexed is False or hasattr(field_class, 'facet_for'):
                field_mapping['index'] = 'not_analyzed'

            if field_mapping['type'] == 'string' and field_class.indexed:
                field_mapping["term_vector"] = "with_positions_offsets"

                if not hasattr(field_class, 'facet_for') and field_class.field_type not in ('ngram', 'edge_ngram'):
                    field_mapping["analyzer"] = "snowball"

            mapping[field_class.index_fieldname] = field_mapping

        return (content_field_name, mapping)

class CustomElasticSearchEngine(ElasticsearchSearchEngine):
    backend = CustomElasticBackend


class NameWordField(NgramField):
    field_type = 'nameword'


class EmailField(NgramField):
    field_type = 'email'
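The custom field types are then used in a search index like any other Haystack field. The following sketch assumes a hypothetical Contact model, since the actual models were not posted; build_schema recognizes the fields via their field_type and applies makemapping/emailmapping to them:

from haystack import indexes
from myservice.apps.search.search_backends import NameWordField, EmailField
from contacts.models import Contact  # hypothetical model

class ContactIndex(indexes.SearchIndex, indexes.Indexable):
    text = NameWordField(document=True, use_template=True)
    first_name = NameWordField(model_attr='first_name')
    last_name = NameWordField(model_attr='last_name')
    company = NameWordField(model_attr='company')
    email = EmailField(model_attr='email')

    def get_model(self):
        return Contact

After rebuilding the index, makequery can address the generated sub-fields such as first_name.partial_front or email.keyword.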