Without knowing your exact mapping and sample data it is difficult to tell why your search returns too many results. However, I assume that your edgengram tokenizer uses a very small minimum gram size, such as 1 or 2. With such a setting almost every query term matches something. For example, take the following phrase with a minimum gram size of 1:
a quick brown fox
It is tokenized as follows:
a q qu qui quic quick b br bro brow brown f fo fox
Such short fragments match almost any query, which can produce far too many results. As a solution you might use a larger minimum gram size and add fuzzy search to still catch similar terms.
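To see how quickly the number of tokens grows, here is a small standalone Python sketch that mimics what a front edgeNGram filter does per word (just an illustration of the principle, not Elasticsearch code):

def edge_ngrams(text, min_gram, max_gram=15):
    # Emit front edge n-grams per word, like an edgeNGram filter would.
    tokens = []
    for word in text.lower().split():
        for size in range(min_gram, min(len(word), max_gram) + 1):
            tokens.append(word[:size])
    return tokens

print(edge_ngrams("a quick brown fox", min_gram=1))  # 14 tokens, incl. 'a', 'q', 'b', 'f'
print(edge_ngrams("a quick brown fox", min_gram=3))  # 7 tokens: 'qui'..'quick', 'bro'..'brown', 'fox'

With min_gram=1 a single-letter query term already matches every word starting with that letter; raising the minimum to 3 (as in the settings below) keeps only reasonably selective tokens.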
But first, please provide your exact data mapping, sample data and your queries.
Below is an example custom backend. The crucial parts are the custom field types at the bottom of the file and the build_schema function.
Example Custom Backend Config:
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'myservice.apps.search.search_backends.CustomElasticSearchEngine',
        'URL': 'http://127.0.0.1:9200/',
        'INDEX_NAME': 'haystack_prod',
        'TIMEOUT': 60,
    },
}
Example Custom Backend:
from django.conf import settings
from haystack.backends.elasticsearch_backend import ElasticsearchSearchBackend, ElasticsearchSearchEngine
from haystack.fields import NgramField
from haystack.models import SearchResult
import requests
import pyelasticsearch

class CustomElasticBackend(ElasticsearchSearchBackend):
    #DEFAULT_ANALYZER = "snowball"
    DEFAULT_SETTINGS = {
        'settings': {
            "analysis": {
                "analyzer": {
                    "ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_ngram"]
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["lowercase", "asciifolding", "haystack_edgengram"]
                    },
                    "full_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding"]
                    },
                    "partial_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_ngrams"]
                    },
                    "partial_text_front": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_front"]
                    },
                    "partial_text_back": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_back"]
                    }
                },
                "tokenizer": {
                    "haystack_ngram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15,
                    },
                    "haystack_edgengram_tokenizer": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15,
                        "side": "front"
                    }
                },
                "filter": {
                    "haystack_ngram": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "haystack_edgengram": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "text_ngrams": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_front": {
                        "type": "edgeNGram",
                        "side": "front",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_back": {
                        "type": "edgeNGram",
                        "side": "back",
                        "min_gram": 3,
                        "max_gram": 50
                    }
                }
            }
        }
    }
    def makemapping(self, index_fieldname):
        # Index the field several ways via multi_field: as-is, as full
        # words, and as n-gram/edge-n-gram variants for partial matching.
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "partial_text",
                                  "include_in_all": True},
                "full": {"type": "string",
                         "analyzer": "full_text",
                         "include_in_all": True},
                "partial": {"type": "string",
                            "index_analyzer": "partial_text",
                            "search_analyzer": "full_text",
                            "include_in_all": True},
                "partial_front": {"type": "string",
                                  "index_analyzer": "partial_text_front",
                                  "search_analyzer": "full_text",
                                  "include_in_all": True},
                "partial_back": {"type": "string",
                                 "index_analyzer": "partial_text_back",
                                 "search_analyzer": "full_text",
                                 "include_in_all": True}
            }
        }

    def emailmapping(self, index_fieldname):
        # E-mail addresses are indexed both tokenized (standard) and as a
        # single keyword token for exact matches.
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "standard"},
                "keyword": {"type": "string",
                            "analyzer": "keyword",
                            "include_in_all": True},
            }
        }
    def makequery(self, param):
        # Strip the first and last character for the fuzzy clauses so that
        # typos at the edges of the term still match.
        fuzzy_param = param[1:-1] if len(param) > 2 else param
        query = {
            "query": {
                "bool": {
                    "should": [
                        # TODO: autocompletion does not work with fuzzy search
                        {"fuzzy_like_this": {"fields": ["text.full"], "like_text": fuzzy_param, "max_query_terms": 12}},
                        {"fuzzy": {"text": {"value": fuzzy_param, "min_similarity": 0.6}}},
                        #{"fuzzy": {"email": fuzzy_param}},
                        #{"fuzzy": {"first_name": fuzzy_param}},
                        #{"fuzzy": {"last_name": fuzzy_param}},

                        # this for the case first_name is a CharField
                        #{"match": {"first_name": {"query": param, "boost": 10}}},
                        #{"match": {"last_name": {"query": param, "boost": 10}}},

                        # email
                        {"text": {"email": {"boost": 5, "query": param}}},
                        {"text": {"email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email": {"boost": 5, "query": param}}},
                        {"text": {"contact_email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email2": {"boost": 5, "query": param}}},
                        {"text": {"contact_email2.keyword": {"boost": 10, "query": param}}},

                        # first_name
                        #{"text": {"first_name": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"first_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"first_name.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"first_name.partial_back": {"boost": 4, "query": param}}},

                        # last_name
                        #{"text": {"last_name": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"last_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"last_name.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"last_name.partial_back": {"boost": 4, "query": param}}},

                        # company
                        #{"text": {"company": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"company.partial": {"boost": 5, "query": param}}},
                        {"text": {"company.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"company.partial_back": {"boost": 4, "query": param}}},

                        # text (plain ngrams give less accurate results)
                        #{"text": {"text": {"boost": 1, "query": param, "type": "phrase"}}},
                        {"text": {"text.partial": {"boost": 3, "query": param, "type": "phrase"}}},
                        {"text": {"text.partial_front": {"boost": 5, "query": param, "type": "phrase"}}},
                        #{"text": {"text.partial_back": {"boost": 5, "query": param, "type": "phrase"}}}
                    ]
                }
            },
            "size": 100
        }
        return query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        if not self.setup_complete:
            self.setup()

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)
        search_kwargs['from'] = kwargs.get('start_offset', 0)

        order_fields = set()
        for order in search_kwargs.get('sort', []):
            for key in order.keys():
                order_fields.add(key)
        geo_sort = '_geo_distance' in order_fields

        end_offset = kwargs.get('end_offset')
        start_offset = kwargs.get('start_offset', 0)
        if end_offset is not None and end_offset > start_offset:
            search_kwargs['size'] = end_offset - start_offset

        try:
            raw_results = self.conn.search(search_kwargs,
                                           index=self.index_name,
                                           doc_type='modelresult')
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise
            self.log.error("Failed to query Elasticsearch using '%s': %s", query_string, e)
            raw_results = {}

        return self._process_results(raw_results,
                                     highlight=kwargs.get('highlight'),
                                     result_class=kwargs.get('result_class', SearchResult),
                                     distance_point=kwargs.get('distance_point'),
                                     geo_sort=geo_sort)
    def build_search_kwargs(self, query_string, **kwargs):
        # Always use the hand-written query. To fall back to the stock
        # behaviour, call the original implementation instead:
        # return super(CustomElasticBackend, self).build_search_kwargs(query_string, **kwargs)
        return self.makequery(query_string)
    def build_schema(self, fields):
        content_field_name = ''
        mapping = {}

        for field_name, field_class in fields.items():
            if field_class.field_type == 'nameword':
                field_mapping = self.makemapping(field_class.index_fieldname)
            elif field_class.field_type == 'email':
                field_mapping = self.emailmapping(field_class.index_fieldname)
            else:
                field_mapping = {
                    'boost': field_class.boost,
                    'index': 'analyzed',
                    'store': 'yes',
                    'type': 'string',
                }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            # checks succeed, call a custom method on the form that
            # returns, per-backend, the right type of storage?
            if field_class.field_type in ['date', 'datetime']:
                field_mapping['type'] = 'date'
            elif field_class.field_type == 'integer':
                field_mapping['type'] = 'long'
            elif field_class.field_type == 'float':
                field_mapping['type'] = 'float'
            elif field_class.field_type == 'boolean':
                field_mapping['type'] = 'boolean'
            elif field_class.field_type == 'ngram':
                field_mapping['analyzer'] = "ngram_analyzer"
            elif field_class.field_type == 'edge_ngram':
                field_mapping['analyzer'] = "edgengram_analyzer"
            elif field_class.field_type == 'location':
                field_mapping['type'] = 'geo_point'

            # The docs claim nothing is needed for multivalue...
            # if field_class.is_multivalued:
            #     field_data['multi_valued'] = 'true'

            if field_class.stored is False:
                field_mapping['store'] = 'no'

            # Do this last to override `text` fields.
            if field_class.indexed is False or hasattr(field_class, 'facet_for'):
                field_mapping['index'] = 'not_analyzed'

            if field_mapping['type'] == 'string' and field_class.indexed:
                field_mapping["term_vector"] = "with_positions_offsets"

                if not hasattr(field_class, 'facet_for') and field_class.field_type not in ('ngram', 'edge_ngram'):
                    field_mapping["analyzer"] = "snowball"

            mapping[field_class.index_fieldname] = field_mapping

        return (content_field_name, mapping)

class CustomElasticSearchEngine(ElasticsearchSearchEngine):
    backend = CustomElasticBackend


class NameWordField(NgramField):
    field_type = 'nameword'


class EmailField(NgramField):
    field_type = 'email'
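The custom field types are then used in a search index like any other Haystack field. The following sketch assumes a hypothetical Contact model, since the actual models were not posted; build_schema recognizes the fields via their field_type and applies makemapping/emailmapping to them:

from haystack import indexes
from myservice.apps.search.search_backends import NameWordField, EmailField
from contacts.models import Contact  # hypothetical model

class ContactIndex(indexes.SearchIndex, indexes.Indexable):
    text = NameWordField(document=True, use_template=True)
    first_name = NameWordField(model_attr='first_name')
    last_name = NameWordField(model_attr='last_name')
    company = NameWordField(model_attr='company')
    email = EmailField(model_attr='email')

    def get_model(self):
        return Contact

After rebuilding the index, makequery can address the generated sub-fields such as first_name.partial_front or email.keyword.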