Improve option search ranking (#107)

This commit is contained in:
Rok Garbas 2020-06-19 08:53:49 +02:00 committed by GitHub
parent bbc97d17f0
commit 3ecce08a94
Failed to generate hash of commit
8 changed files with 343 additions and 386 deletions

View file

@ -1 +1 @@
7
8

View file

@ -49,83 +49,6 @@ ANALYSIS = {
"tokenizer": "keyword",
"filter": ["lowercase"],
},
"nixOptionName": {
"type": "custom",
"tokenizer": "nix_option_name",
"filter": ["lowercase"],
},
"nixOptionNameGranular": {
"type": "custom",
"tokenizer": "nix_option_name_granular",
"filter": ["lowercase"],
},
},
"tokenizer": {
"nix_package_query": {"type": "pattern", "pattern": "|".join(["[ ]"])},
"nix_package_attr_name": {
"type": "pattern",
# Split on attrname separators like _, .
"pattern": "|".join(
[
"[_.-]", # Common separators like underscores, dots and dashes
"\\d+?Packages", # python37Packages -> python
"\\d+?Plugins", # vimPlugins -> vim
"\\d+?Extensions", # php74Extensions -> php
"\\d+?Interpreters", # perlInterpreters -> perl
# Camelcase tokenizer adapted from
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
"".join(
[
"(?<=[\\p{L}&&[^\\p{Lu}]])" # lower case
"(?=\\p{Lu})", # followed by upper case
"|",
"(?<=\\p{Lu})" # or upper case
"(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])", # followed by lower case
]
),
]
),
},
"nix_option_name": {"type": "pattern", "pattern": "[.]"},
# Lower priority (virtualHost -> [virtual, host])
"nix_option_name_granular": {
"type": "pattern",
# Split on attrname separators like _, .
"pattern": "|".join(
[
"[_.-]", # Common separators like underscores, dots and dashes
# Camelcase tokenizer adapted from
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
"".join(
[
"(?<=[\\p{L}&&[^\\p{Lu}]])" # lower case
"(?=\\p{Lu})", # followed by upper case
"|",
"(?<=\\p{Lu})" # or upper case
"(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])", # followed by lower case
]
),
]
),
},
},
"filter": {
"nix_stopwords": {
"type": "stop",
"ignore_case": True,
"stopwords": [
"packages",
"package",
"options",
"option",
"plugins",
"plugin",
"extensions",
"extension",
"interpreters",
"interpreter",
],
},
},
}
MAPPING = {
@ -175,15 +98,8 @@ MAPPING = {
"package_homepage": {"type": "keyword"},
"package_system": {"type": "keyword"},
# Options fields
"option_name": {
"type": "text",
"analyzer": "nixOptionName",
"fielddata": True,
"fields": {
"raw": {"type": "keyword"},
"granular": {"type": "text", "analyzer": "nixOptionNameGranular"},
},
},
"option_name": {"type": "keyword", "normalizer": "lowercase"},
"option_name_query": {"type": "keyword", "normalizer": "lowercase"},
"option_description": {"type": "text"},
"option_type": {"type": "keyword"},
"option_default": {"type": "text"},
@ -340,7 +256,8 @@ def remove_attr_set(name):
# Plugins
"elasticsearch",
"graylog",
"tmuxplugin" "vimplugin",
"tmuxplugin",
"vimplugin",
]
# TODO: is this correct
if any([name.startswith(i) for i in sets]):
@ -495,6 +412,7 @@ def get_options(evaluation):
yield dict(
type="option",
option_name=name,
option_name_query=split_query(name),
option_description=description,
option_type=option.get("type"),
option_default=default,

View file

@ -1,28 +1,38 @@
module Page.Home exposing (Model, Msg, init, update, view)
import Html exposing (Html, text, div )
import Html exposing (Html, div, text)
-- MODEL
type alias Model = ()
type alias Model =
()
init : (Model, Cmd Msg)
init : ( Model, Cmd Msg )
init =
((), Cmd.none)
( (), Cmd.none )
-- UPDATE
type Msg = NoOp
type Msg
= NoOp
update : Msg -> Model -> ( Model, Cmd Msg )
update msg model =
(model, Cmd.none)
( model, Cmd.none )
-- VIEW
view : Model -> Html Msg
view model =
div [] [text "Home"]
div [] [ text "Home" ]

View file

@ -17,6 +17,8 @@ import Html
, div
, dl
, dt
, li
, p
, pre
, span
, table
@ -26,6 +28,7 @@ import Html
, th
, thead
, tr
, ul
)
import Html.Attributes
exposing
@ -42,6 +45,7 @@ import Html.Parser.Util
import Http
import Json.Decode
import Json.Encode
import Regex
import Search
@ -145,10 +149,30 @@ viewResultItem show item =
else
[]
in
tr [ onClick (SearchMsg (Search.ShowDetails item.source.name)) ]
[ td [] [ text item.source.name ]
]
:: packageDetails
[]
-- DEBUG: |> List.append
-- DEBUG: [ tr []
-- DEBUG: [ td [ colspan 1 ]
-- DEBUG: [ p [] [ text <| "score: " ++ String.fromFloat item.score ]
-- DEBUG: , p []
-- DEBUG: [ text <|
-- DEBUG: "matched queries: "
-- DEBUG: , ul []
-- DEBUG: (item.matched_queries
-- DEBUG: |> Maybe.withDefault []
-- DEBUG: |> List.sort
-- DEBUG: |> List.map (\q -> li [] [ text q ])
-- DEBUG: )
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
|> List.append
(tr [ onClick (SearchMsg (Search.ShowDetails item.source.name)) ]
[ td [] [ text item.source.name ]
]
:: packageDetails
)
viewResultItemDetails :
@ -229,160 +253,6 @@ viewResultItemDetails item =
-- API
makeRequestBody :
    String
    -> Int
    -> Int
    -> Http.Body
makeRequestBody query from size =
    -- Build the Elasticsearch `_search` request body for an option query.
    --
    -- `query` is the user's search text, `from`/`size` are the paging window.
    --
    -- Shape of the generated JSON (example for the query "nginx"):
    --
    -- {
    --   "from": 0,
    --   "size": 1000,
    --   "rescore": {
    --     "window_size": 1000,
    --     "query": {
    --       "score_mode": "total",
    --       "rescore_query": {
    --         "function_score": {
    --           "script_score": {
    --             "script": {
    --               "source": "<Painless script, see rescoreScript>",
    --               "params": { "query": "nginx" }
    --             }
    --           }
    --         }
    --       }
    --     }
    --   },
    --   "query": {
    --     "bool": {
    --       "filter": { "match": { "type": "option" } },
    --       "must": {
    --         "bool": {
    --           "should": [
    --             { "term": { "option_name.raw": { "value": "nginx", "boost": 2.0 } } },
    --             { "term": { "option_name": { "value": "nginx", "boost": 1.0 } } },
    --             { "term": { "option_name.granular": { "value": "nginx", "boost": 0.6 } } },
    --             { "term": { "option_description": { "value": "nginx", "boost": 0.3 } } }
    --           ]
    --         }
    --       }
    --     }
    --   }
    -- }
    let
        stringIn name value =
            [ ( name, Json.Encode.string value ) ]

        listIn name type_ value =
            [ ( name, Json.Encode.list type_ value ) ]

        objectIn name value =
            [ ( name, Json.Encode.object value ) ]

        -- One `term` clause for `field`, matching the whole query with `boost`.
        encodeTerm ( name, boost ) =
            [ ( "term"
              , Json.Encode.object
                    [ ( name
                      , Json.Encode.object
                            [ ( "value", Json.Encode.string query )
                            , ( "boost", Json.Encode.float boost )
                            ]
                      )
                    ]
              )
            ]

        -- Scores a hit by the position of the first dot-separated name segment
        -- equal to the query: 10000 - 100 * position, or 10 when none matches.
        -- The query is read from `params.query` instead of being spliced into
        -- the script text, so quotes or other special characters typed by the
        -- user cannot break or alter the script.
        rescoreScript =
            """int i = 1;
for (token in doc['option_name.raw'][0].splitOnToken('.')) {
if (token == params.query) {
return 10000 - (i * 100);
}
i++;
}
return 10;
"""

        rescore =
            [ ( "source", Json.Encode.string rescoreScript )
            , ( "params", Json.Encode.object [ ( "query", Json.Encode.string query ) ] )
            ]
                |> objectIn "script"
                |> objectIn "script_score"
                |> objectIn "function_score"
                |> objectIn "rescore_query"
                |> List.append (stringIn "score_mode" "total")
                |> objectIn "query"
                |> List.append [ ( "window_size", Json.Encode.int 1000 ) ]
                |> objectIn "rescore"

        -- Only documents of type "option" are considered.
        typeFilter =
            [ ( "type", Json.Encode.string "option" ) ]
                |> objectIn "match"
                |> objectIn "filter"
    in
    [ ( "option_name.raw", 2.0 )
    , ( "option_name", 1.0 )
    , ( "option_name.granular", 0.6 )
    , ( "option_description", 0.3 )
    ]
        |> List.map encodeTerm
        |> listIn "should" Json.Encode.object
        |> objectIn "bool"
        |> objectIn "must"
        |> List.append typeFilter
        |> objectIn "bool"
        |> objectIn "query"
        |> List.append rescore
        |> List.append
            [ ( "from", Json.Encode.int from )
            , ( "size", Json.Encode.int size )
            ]
        |> Json.Encode.object
        |> Http.jsonBody
makeRequest :
Search.Options
-> String
@ -390,9 +260,123 @@ makeRequest :
-> Int
-> Int
-> Cmd Msg
makeRequest options channel query from size =
makeRequest options channel queryRaw from size =
let
query =
queryRaw
|> String.trim
delimiters =
Maybe.withDefault Regex.never (Regex.fromString "[. ]")
-- Fuzzy `match` clauses (fuzziness 1, whitespace analyzer) for the whole
-- query, one per searched field. Each clause is tagged `should_match_<n>`
-- so it can be identified in the response's matched queries.
-- `boost_base` is the weight of this tier of clauses; it is multiplied with
-- the per-field weight, the same way `should_term` and `should_terms` do.
should_match boost_base =
    List.indexedMap
        (\i ( field, boost ) ->
            [ ( "match"
              , Json.Encode.object
                    [ ( field
                      , Json.Encode.object
                            [ ( "query", Json.Encode.string query )
                            , ( "boost", Json.Encode.float <| boost_base * boost )
                            , ( "analyzer", Json.Encode.string "whitespace" )
                            , ( "fuzziness", Json.Encode.string "1" )
                            , ( "_name"
                              , Json.Encode.string <|
                                    "should_match_"
                                        ++ String.fromInt (i + 1)
                              )
                            ]
                      )
                    ]
              )
            ]
        )
        [ ( "option_name", 1 )
        , ( "option_name_query", 1 )
        , ( "option_description", 1 )
        ]
-- `match_bool_prefix` clauses (fuzziness 1, whitespace analyzer) for the
-- whole query, one per option-name field. Each clause is tagged
-- `should_match_bool_prefix_<n>`.
-- `boost_base` is the weight of this tier of clauses; it is multiplied with
-- the per-field weight, the same way `should_term` and `should_terms` do.
should_match_bool_prefix boost_base =
    List.indexedMap
        (\i ( field, boost ) ->
            [ ( "match_bool_prefix"
              , Json.Encode.object
                    [ ( field
                      , Json.Encode.object
                            [ ( "query", Json.Encode.string query )
                            , ( "boost", Json.Encode.float <| boost_base * boost )
                            , ( "analyzer", Json.Encode.string "whitespace" )
                            , ( "fuzziness", Json.Encode.string "1" )
                            , ( "_name"
                              , Json.Encode.string <|
                                    "should_match_bool_prefix_"
                                        ++ String.fromInt (i + 1)
                              )
                            ]
                      )
                    ]
              )
            ]
        )
        [ ( "option_name", 1 )
        , ( "option_name_query", 1 )
        ]
-- `terms` clauses, one per option-name field: a document matches when the
-- field equals any of the pieces obtained by splitting the query with
-- `delimiters`. Each clause is tagged `should_terms_<n>` and boosted by
-- `boost_base` times the per-field weight.
should_terms boost_base =
    let
        -- The split query, encoded once and shared by every field's clause.
        queryParts =
            Regex.split delimiters query
                |> Json.Encode.list Json.Encode.string

        clause position ( field, weight ) =
            [ ( "terms"
              , Json.Encode.object
                    [ ( field, queryParts )
                    , ( "boost", Json.Encode.float (boost_base * weight) )
                    , ( "_name"
                      , Json.Encode.string ("should_terms_" ++ String.fromInt (position + 1))
                      )
                    ]
              )
            ]
    in
    [ ( "option_name", 1 )
    , ( "option_name_query", 1 )
    ]
        |> List.indexedMap clause
-- Exact `term` clauses, one per option-name field, matching the whole
-- query string. Each clause is tagged `should_term_<n>` and boosted by
-- `boost_base` times the per-field weight.
should_term boost_base =
    let
        clause position ( field, weight ) =
            let
                termBody =
                    Json.Encode.object
                        [ ( "value", Json.Encode.string query )
                        , ( "boost", Json.Encode.float (boost_base * weight) )
                        , ( "_name"
                          , Json.Encode.string ("should_term_" ++ String.fromInt (position + 1))
                          )
                        ]
            in
            [ ( "term", Json.Encode.object [ ( field, termBody ) ] ) ]
    in
    [ ( "option_name", 1 )
    , ( "option_name_query", 1 )
    ]
        |> List.indexedMap clause
-- Every scoring clause sent in the `should` section, loosest match first.
-- Each helper receives the boost base of its tier.
should_queries =
    List.concat
        [ should_match 10
        , should_match_bool_prefix 100
        , should_terms 1000
        , should_term 10000
        ]
in
Search.makeRequest
makeRequestBody
(Search.makeRequestBody query from size "option" "option_name_query" should_queries)
("latest-" ++ String.fromInt options.mappingSchemaVersion ++ "-" ++ channel)
decodeResultItemSource
options

View file

@ -362,12 +362,14 @@ viewResultItemDetails channel item =
-- API
makeRequestBody :
String
makeRequest :
Search.Options
-> String
-> String
-> Int
-> Int
-> Http.Body
makeRequestBody queryRaw from size =
-> Cmd Msg
makeRequest options channel queryRaw from size =
let
query =
queryRaw
@ -481,130 +483,15 @@ makeRequestBody queryRaw from size =
, ( "package_pname", 1 )
]
filter_packages =
( "term"
, Json.Encode.object
[ ( "type"
, Json.Encode.object
[ ( "value", Json.Encode.string "package" )
, ( "_name", Json.Encode.string "filter_packages" )
]
)
]
)
filter_queries =
let
filterQuery =
query
|> String.replace "." " "
in
filterQuery
|> String.words
|> List.indexedMap
(\i query_word ->
let
isLast =
List.length (String.words filterQuery) == i + 1
in
[ if isLast then
( "bool"
, Json.Encode.object
[ ( "should"
, Json.Encode.list Json.Encode.object
[ [ ( "match"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name", Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_match" )
]
)
]
)
]
, [ ( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_prefix"
)
]
)
]
)
]
]
)
]
)
else
( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_prefix"
)
]
)
]
)
]
)
should_queries =
[]
|> List.append (should_term 10000)
|> List.append (should_terms 1000)
|> List.append (should_match_bool_prefix 100)
|> List.append (should_match 10)
in
Http.jsonBody
(Json.Encode.object
[ ( "from"
, Json.Encode.int from
)
, ( "size"
, Json.Encode.int size
)
, ( "query"
, Json.Encode.object
[ ( "bool"
, Json.Encode.object
[ ( "filter"
, Json.Encode.list Json.Encode.object
(List.append
[ [ filter_packages ] ]
filter_queries
)
)
, ( "should"
, Json.Encode.list
Json.Encode.object
([]
|> List.append (should_term 10000)
|> List.append (should_terms 1000)
|> List.append (should_match_bool_prefix 100)
|> List.append (should_match 10)
)
)
]
)
]
)
]
)
makeRequest :
Search.Options
-> String
-> String
-> Int
-> Int
-> Cmd Msg
makeRequest options channel query from size =
Search.makeRequest
makeRequestBody
(Search.makeRequestBody query from size "package" "package_attr_name_query" should_queries)
("latest-" ++ String.fromInt options.mappingSchemaVersion ++ "-" ++ channel)
decodeResultItemSource
options

View file

@ -8,6 +8,7 @@ module Search exposing
, decodeResult
, init
, makeRequest
, makeRequestBody
, update
, view
)
@ -559,8 +560,161 @@ type alias Options =
}
-- Build the `term` filter clause that keeps only documents whose `type`
-- field equals `type_`. The clause is tagged `filter_<type_>s` through
-- `_name` (e.g. "filter_packages" for "package").
filter_by_type :
    String
    -> ( String, Json.Encode.Value )
filter_by_type type_ =
    let
        clauseName =
            "filter_" ++ type_ ++ "s"

        typeClause =
            Json.Encode.object
                [ ( "value", Json.Encode.string type_ )
                , ( "_name", Json.Encode.string clauseName )
                ]
    in
    ( "term", Json.Encode.object [ ( "type", typeClause ) ] )
-- Build one filter clause per word of the query, all against `field`.
--
-- The query is trimmed and dots are treated as word separators, so
-- "services.nginx" yields the words "services" and "nginx".
--
--   * every word except the last must match as a prefix
--     (`match_bool_prefix`, tagged `filter_queries_<n>_prefix`);
--   * the last word may match either fuzzily (`match` with fuzziness 1,
--     tagged `filter_queries_<n>_should_match`) or as a prefix
--     (tagged `filter_queries_<n>_should_prefix`).
--
-- Returns one single-entry object (as a key/value list) per word, ready for
-- `Json.Encode.list Json.Encode.object`.
filter_by_query : String -> String -> List (List ( String, Json.Encode.Value ))
filter_by_query field queryRaw =
    let
        queryWords =
            queryRaw
                |> String.trim
                |> String.replace "." " "
                |> String.words

        -- Must be derived from the same dot-split word list that is iterated
        -- below; counting the words of the un-split query marks the wrong
        -- word as "last" for dotted queries.
        lastIndex =
            List.length queryWords - 1

        clauseName i suffix =
            "filter_queries_" ++ String.fromInt (i + 1) ++ suffix

        prefixClause i suffix query_word =
            ( "match_bool_prefix"
            , Json.Encode.object
                [ ( field
                  , Json.Encode.object
                        [ ( "query", Json.Encode.string query_word )
                        , ( "_name", Json.Encode.string (clauseName i suffix) )
                        ]
                  )
                ]
            )

        fuzzyClause i query_word =
            ( "match"
            , Json.Encode.object
                [ ( field
                  , Json.Encode.object
                        [ ( "query", Json.Encode.string query_word )
                        , ( "fuzziness", Json.Encode.string "1" )
                        , ( "_name", Json.Encode.string (clauseName i "_should_match") )
                        ]
                  )
                ]
            )

        wordFilter i query_word =
            if i == lastIndex then
                [ ( "bool"
                  , Json.Encode.object
                        [ ( "should"
                          , Json.Encode.list Json.Encode.object
                                [ [ fuzzyClause i query_word ]
                                , [ prefixClause i "_should_prefix" query_word ]
                                ]
                          )
                        ]
                  )
                ]

            else
                [ prefixClause i "_prefix" query_word ]
    in
    List.indexedMap wordFilter queryWords
-- Build the Elasticsearch `_search` request body shared by all search pages.
--
--   query          the user's search text
--   from, size     paging window
--   type_          document type to keep (passed to `filter_by_type`)
--   query_field    field the per-word filters run against (`filter_by_query`)
--   should_queries scoring clauses supplied by the calling page
--
-- Produces { "from", "size", "query": { "bool": { "filter", "should" } } }.
makeRequestBody :
    String
    -> Int
    -> Int
    -> String
    -> String
    -> List (List ( String, Json.Encode.Value ))
    -> Http.Body
makeRequestBody query from size type_ query_field should_queries =
    -- TODO: rescore how close the query is to the root of the name
    -- NOTE(review): the sketch below concatenates `query` into the script
    -- source; if it is revived, consider passing it via script `params`.
    -- |> List.append
    -- ("""int i = 1;
    -- for (token in doc['option_name.raw'][0].splitOnToken('.')) {
    -- if (token == '"""
    -- ++ query
    -- ++ """') {
    -- return 10000 - (i * 100);
    -- }
    -- i++;
    -- }
    -- return 10;
    -- """
    -- |> stringIn "source"
    -- |> objectIn "script"
    -- |> objectIn "script_score"
    -- |> objectIn "function_score"
    -- |> objectIn "rescore_query"
    -- |> List.append ("total" |> stringIn "score_mode")
    -- |> objectIn "query"
    -- |> List.append [ ( "window_size", Json.Encode.int 1000 ) ]
    -- |> objectIn "rescore"
    -- )
    -- |> List.append
    -- [ ( "from", Json.Encode.int from )
    -- , ( "size", Json.Encode.int size )
    -- ]
    -- |> Json.Encode.object
    -- |> Http.jsonBody
    let
        -- The type filter comes first, followed by one filter per query word.
        filterClauses =
            [ filter_by_type type_ ] :: filter_by_query query_field query

        boolQuery =
            Json.Encode.object
                [ ( "filter", Json.Encode.list Json.Encode.object filterClauses )
                , ( "should", Json.Encode.list Json.Encode.object should_queries )
                ]
    in
    [ ( "from", Json.Encode.int from )
    , ( "size", Json.Encode.int size )
    , ( "query", Json.Encode.object [ ( "bool", boolQuery ) ] )
    ]
        |> Json.Encode.object
        |> Http.jsonBody
makeRequest :
(String -> Int -> Int -> Http.Body)
Http.Body
-> String
-> Json.Decode.Decoder a
-> Options
@ -568,7 +722,7 @@ makeRequest :
-> Int
-> Int
-> Cmd (Msg a)
makeRequest makeRequestBody index decodeResultItemSource options query from sizeRaw =
makeRequest body index decodeResultItemSource options query from sizeRaw =
let
-- you can not request more then 10000 results otherwise it will return 404
size =
@ -584,7 +738,7 @@ makeRequest makeRequestBody index decodeResultItemSource options query from size
[ Http.header "Authorization" ("Basic " ++ Base64.encode (options.username ++ ":" ++ options.password))
]
, url = options.url ++ "/" ++ index ++ "/_search"
, body = makeRequestBody query from size
, body = body
, expect =
Http.expectJson
(RemoteData.fromResult >> QueryResponse)

View file

@ -4,9 +4,10 @@ require("./index.scss");
const {Elm} = require('./Main');
console.log("WORKS: " + process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION);
Elm.Main.init({
flags: {
elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 0,
elasticsearchMappingSchemaVersion: parseInt(process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION) || 0,
elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443',
elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR',
elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG'

View file

@ -26,6 +26,9 @@ var common = {
filename: MODE == "production" ? "[name]-[hash].js" : "index.js"
},
plugins: [
new webpack.EnvironmentPlugin([
"ELASTICSEARCH_MAPPING_SCHEMA_VERSION"
]),
new HTMLWebpackPlugin({
// Use this template to get basic responsive meta tags
template: "src/index.html",