Reworked search query (#197)

- still use multi_match but now with cross_fields type
- add edge n-gram index field since cross_fields works only with match
  and this way we can have prefix support
- suffix matching is still supported, as before, by reversing the words in
  data and query
- for the query we now create variations of all multi_match queries (each
  word is used both as typed and reversed): from 2 words you get 4 queries,
  from 3 words you get 8 queries, and so on.
This commit is contained in:
Rok Garbas 2020-09-23 14:39:36 +02:00 committed by GitHub
parent 5a65c6f94b
commit 86ad9d036d
Failed to generate hash of commit
5 changed files with 180 additions and 79 deletions

View file

@ -1 +1 @@
12 13

View file

@ -34,7 +34,22 @@ ANALYSIS = {
"normalizer": { "normalizer": {
"lowercase": {"type": "custom", "char_filter": [], "filter": ["lowercase"]} "lowercase": {"type": "custom", "char_filter": [], "filter": ["lowercase"]}
}, },
"tokenizer": {
"edge": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 50,
"token_chars": [
"letter",
"digit",
# Either we use them or we would need to strip them before that.
"punctuation",
"symbol",
],
},
},
"analyzer": { "analyzer": {
"edge": {"tokenizer": "edge"},
"lowercase": { "lowercase": {
"type": "custom", "type": "custom",
"tokenizer": "keyword", "tokenizer": "keyword",
@ -65,22 +80,67 @@ MAPPING = {
"drv_path": {"type": "keyword"}, "drv_path": {"type": "keyword"},
}, },
}, },
"package_attr_name": {"type": "keyword", "normalizer": "lowercase"}, "package_attr_name": {
"package_attr_name_reverse": {"type": "keyword", "normalizer": "lowercase"}, "type": "keyword",
"package_attr_name_query": {"type": "keyword", "normalizer": "lowercase"}, "normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_attr_name_reverse": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_attr_name_query": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_attr_name_query_reverse": { "package_attr_name_query_reverse": {
"type": "keyword", "type": "keyword",
"normalizer": "lowercase", "normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_attr_set": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_attr_set_reverse": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_pname": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_pname_reverse": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
}, },
"package_attr_set": {"type": "keyword", "normalizer": "lowercase"},
"package_attr_set_reverse": {"type": "keyword", "normalizer": "lowercase"},
"package_pname": {"type": "keyword", "normalizer": "lowercase"},
"package_pname_reverse": {"type": "keyword", "normalizer": "lowercase"},
"package_pversion": {"type": "keyword"}, "package_pversion": {"type": "keyword"},
"package_description": {"type": "text", "analyzer": "english"}, "package_description": {
"package_description_reverse": {"type": "text", "analyzer": "english"}, "type": "text",
"package_longDescription": {"type": "text", "analyzer": "english"}, "analyzer": "english",
"package_longDescription_reverse": {"type": "text", "analyzer": "english"}, "fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_description_reverse": {
"type": "text",
"analyzer": "english",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_longDescription": {
"type": "text",
"analyzer": "english",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_longDescription_reverse": {
"type": "text",
"analyzer": "english",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"package_license": { "package_license": {
"type": "nested", "type": "nested",
"properties": {"fullName": {"type": "text"}, "url": {"type": "text"}}, "properties": {"fullName": {"type": "text"}, "url": {"type": "text"}},
@ -98,12 +158,36 @@ MAPPING = {
"package_homepage": {"type": "keyword"}, "package_homepage": {"type": "keyword"},
"package_system": {"type": "keyword"}, "package_system": {"type": "keyword"},
# Options fields # Options fields
"option_name": {"type": "keyword", "normalizer": "lowercase"}, "option_name": {
"option_name_reverse": {"type": "keyword", "normalizer": "lowercase"}, "type": "keyword",
"option_name_query": {"type": "keyword", "normalizer": "lowercase"}, "normalizer": "lowercase",
"option_name_query_reverse": {"type": "keyword", "normalizer": "lowercase"}, "fields": {"edge": {"type": "text", "analyzer": "edge"}},
"option_description": {"type": "text", "analyzer": "english"}, },
"option_description_reverse": {"type": "text", "analyzer": "english"}, "option_name_reverse": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"option_name_query": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"option_name_query_reverse": {
"type": "keyword",
"normalizer": "lowercase",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"option_description": {
"type": "text",
"analyzer": "english",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"option_description_reverse": {
"type": "text",
"analyzer": "english",
"fields": {"edge": {"type": "text", "analyzer": "edge"}},
},
"option_type": {"type": "keyword"}, "option_type": {"type": "keyword"},
"option_default": {"type": "text"}, "option_default": {"type": "text"},
"option_example": {"type": "text"}, "option_example": {"type": "text"},

View file

@ -288,8 +288,8 @@ makeRequest options channel query from size sort =
sort sort
"option" "option"
"option_name" "option_name"
[ ( "option_name", 2.2 ) [ ( "option_name", 6.0 )
, ( "option_name_query", 2.0 ) , ( "option_name_query", 3.0 )
, ( "option_description", 1.0 ) , ( "option_description", 1.0 )
] ]
) )

View file

@ -414,10 +414,10 @@ makeRequest options channel query from size sort =
sort sort
"package" "package"
"package_attr_name" "package_attr_name"
[ ( "package_attr_name", 2.4 ) [ ( "package_attr_name", 9.0 )
, ( "package_pname", 2.2 ) , ( "package_pname", 6.0 )
, ( "package_attr_name_query", 2.0 ) , ( "package_attr_name_query", 4.0 )
, ( "package_description", 1.2 ) , ( "package_description", 1.3 )
, ( "package_longDescription", 1.0 ) , ( "package_longDescription", 1.0 )
] ]
) )

View file

@ -62,6 +62,7 @@ import Http
import Json.Decode import Json.Decode
import Json.Encode import Json.Encode
import RemoteData import RemoteData
import Set
import Task import Task
import Url.Builder import Url.Builder
@ -753,38 +754,56 @@ filter_by_type type_ =
] ]
search_fields : searchFields :
Float String
-> List String
-> List ( String, Float ) -> List ( String, Float )
-> List (List ( String, Json.Encode.Value )) -> List (List ( String, Json.Encode.Value ))
search_fields baseScore queryWords fields = searchFields query fields =
queryWords let
|> List.reverse queryVariations q =
|> List.indexedMap case ( List.head q, List.tail q ) of
(\queryIndex queryWord -> ( Just h, Just t ) ->
let
tail : List (List String)
tail =
queryVariations t
in
List.append
(List.map (\x -> List.append [ h ] x) tail)
(List.map (\x -> List.append [ String.reverse h ] x) tail)
|> Set.fromList
|> Set.toList
( Just h, Nothing ) ->
[ [ h ], [ String.reverse h ] ]
( _, _ ) ->
[ [], [] ]
reverseFields =
List.map (\( field, score ) -> ( field ++ "_reverse", score * 0.8 )) fields
allFields =
List.append fields reverseFields
|> List.map (\( field, score ) -> [ field ++ "^" ++ String.fromFloat score, field ++ ".edge^" ++ String.fromFloat score ])
|> List.concat
in
List.map
(\queryWords ->
[ ( "multi_match" [ ( "multi_match"
, Json.Encode.object , Json.Encode.object
[ ( "type", Json.Encode.string "bool_prefix" ) [ ( "type", Json.Encode.string "cross_fields" )
, ( "query", Json.Encode.string queryWord ) , ( "query", Json.Encode.string <| String.join " " queryWords )
, ( "analyzer", Json.Encode.string "lowercase" ) , ( "analyzer", Json.Encode.string "whitespace" )
, ( "auto_generate_synonyms_phrase_query", Json.Encode.bool False ) , ( "auto_generate_synonyms_phrase_query", Json.Encode.bool False )
, ( "prefix_length", Json.Encode.int 3 ) , ( "operator", Json.Encode.string "and" )
, ( "operator", Json.Encode.string "or" ) , ( "_name", Json.Encode.string <| "multi_match_" ++ String.join "_" queryWords )
, ( "_name" , ( "fields", Json.Encode.list Json.Encode.string allFields )
, Json.Encode.string <| "multi_match_" ++ queryWord ++ "_" ++ (queryIndex + 1 |> String.fromInt)
)
, ( "fields"
, Json.Encode.list Json.Encode.string
(List.map
(\( field, score ) -> field ++ "^" ++ (baseScore * (score + (0.1 * (queryIndex + 1 |> toFloat))) |> String.fromFloat))
fields
)
)
] ]
) )
] ]
) )
(queryVariations (String.words query))
makeRequestBody : makeRequestBody :
@ -830,31 +849,29 @@ makeRequestBody query from sizeRaw sort type_ sortField fields =
[ ( "tie_breaker", Json.Encode.float 0.7 ) [ ( "tie_breaker", Json.Encode.float 0.7 )
, ( "queries" , ( "queries"
, Json.Encode.list Json.Encode.object , Json.Encode.list Json.Encode.object
[ [ ( "bool" (searchFields query fields)
, Json.Encode.object -- [ [ ( "bool"
[ ( "must" -- , Json.Encode.object
, Json.Encode.list Json.Encode.object <| -- [ ( "must"
search_fields -- , Json.Encode.list Json.Encode.object <|
1.0 -- searchFields query fields
(String.words query) -- )
fields -- ]
) -- )
] -- ]
) -- ]
] -- , [ ( "bool"
, [ ( "bool" -- , Json.Encode.object
, Json.Encode.object -- [ ( "must"
[ ( "must" -- , Json.Encode.list Json.Encode.object <|
, Json.Encode.list Json.Encode.object <| -- searchFields
search_fields -- 0.8
0.8 -- (String.words query |> List.map String.reverse)
(String.words query |> List.map String.reverse) -- )
(List.map (\( field, score ) -> ( field ++ "_reverse", score )) fields) -- ]
) -- )
] -- ]
) --]
]
]
) )
] ]
) )