From 3ecce08a94e33e6387c2d147d50da6ab06d3520b Mon Sep 17 00:00:00 2001 From: Rok Garbas Date: Fri, 19 Jun 2020 08:53:49 +0200 Subject: [PATCH] Improve option search ranking (#107) --- VERSION | 2 +- import-scripts/import_scripts/channel.py | 92 +------ src/Page/Home.elm | 26 +- src/Page/Options.elm | 304 +++++++++++------------ src/Page/Packages.elm | 139 +---------- src/Search.elm | 160 +++++++++++- src/index.js | 3 +- webpack.config.js | 3 + 8 files changed, 343 insertions(+), 386 deletions(-) diff --git a/VERSION b/VERSION index 7f8f011..45a4fb7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -7 +8 diff --git a/import-scripts/import_scripts/channel.py b/import-scripts/import_scripts/channel.py index 8618d4d..51f064e 100644 --- a/import-scripts/import_scripts/channel.py +++ b/import-scripts/import_scripts/channel.py @@ -49,83 +49,6 @@ ANALYSIS = { "tokenizer": "keyword", "filter": ["lowercase"], }, - "nixOptionName": { - "type": "custom", - "tokenizer": "nix_option_name", - "filter": ["lowercase"], - }, - "nixOptionNameGranular": { - "type": "custom", - "tokenizer": "nix_option_name_granular", - "filter": ["lowercase"], - }, - }, - "tokenizer": { - "nix_package_query": {"type": "pattern", "pattern": "|".join(["[ ]"])}, - "nix_package_attr_name": { - "type": "pattern", - # Split on attrname separators like _, . - "pattern": "|".join( - [ - "[_.-]", # Common separators like underscores, dots and dashes - "\\d+?Packages", # python37Packages -> python - "\\d+?Plugins", # vimPlugins -> vim - "\\d+?Extensions", # php74Extensions -> php - "\\d+?Interpreters", # perlInterpreters -> perl - # Camelcase tokenizer adapted from - # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html - "".join( - [ - "(?<=[\\p{L}&&[^\\p{Lu}]])" # lower case - "(?=\\p{Lu})", # followed by upper case - "|", - "(?<=\\p{Lu})" # or upper case - "(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])", # followed by lower case - ] - ), - ] - ), - }, - "nix_option_name": {"type": "pattern", "pattern": "[.]"}, - # Lower priority (virtualHost -> [virtual, host]) - "nix_option_name_granular": { - "type": "pattern", - # Split on attrname separators like _, . - "pattern": "|".join( - [ - "[_.-]", # Common separators like underscores, dots and dashes - # Camelcase tokenizer adapted from - # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html - "".join( - [ - "(?<=[\\p{L}&&[^\\p{Lu}]])" # lower case - "(?=\\p{Lu})", # followed by upper case - "|", - "(?<=\\p{Lu})" # or upper case - "(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])", # followed by lower case - ] - ), - ] - ), - }, - }, - "filter": { - "nix_stopwords": { - "type": "stop", - "ignore_case": True, - "stopwords": [ - "packages", - "package", - "options", - "option", - "plugins", - "plugin", - "extensions", - "extension", - "interpreters", - "interpreter", - ], - }, }, } MAPPING = { @@ -175,15 +98,8 @@ MAPPING = { "package_homepage": {"type": "keyword"}, "package_system": {"type": "keyword"}, # Options fields - "option_name": { - "type": "text", - "analyzer": "nixOptionName", - "fielddata": True, - "fields": { - "raw": {"type": "keyword"}, - "granular": {"type": "text", "analyzer": "nixOptionNameGranular"}, - }, - }, + "option_name": {"type": "keyword", "normalizer": "lowercase"}, + "option_name_query": {"type": "keyword", "normalizer": "lowercase"}, "option_description": {"type": "text"}, "option_type": {"type": "keyword"}, "option_default": {"type": "text"}, @@ -340,7 +256,8 @@ def remove_attr_set(name): # Plugins "elasticsearch", "graylog", - "tmuxplugin" "vimplugin", + "tmuxplugin", + "vimplugin", ] # TODO: is this correct if any([name.startswith(i) for i in sets]): @@ -495,6 +412,7 @@ def get_options(evaluation): yield dict( type="option", option_name=name, + option_name_query=split_query(name), option_description=description, option_type=option.get("type"), option_default=default, diff --git a/src/Page/Home.elm b/src/Page/Home.elm index 42b5520..4b83df7 100644 --- a/src/Page/Home.elm +++ b/src/Page/Home.elm @@ -1,28 +1,38 @@ module Page.Home exposing (Model, Msg, init, update, view) -import Html exposing (Html, text, div ) +import Html exposing (Html, div, text) + + -- MODEL -type alias Model = () + +type alias Model = + () -init : (Model, Cmd Msg) +init : ( Model, Cmd Msg ) init = - ((), Cmd.none) + ( (), Cmd.none ) + -- UPDATE -type Msg = NoOp + +type Msg + = NoOp + update : Msg -> Model -> ( Model, Cmd Msg ) update msg model = - (model, Cmd.none) + ( model, Cmd.none ) + + -- VIEW + view : Model -> Html Msg view model = - div [] [text "Home"] - + div [] [ text "Home" ] diff --git a/src/Page/Options.elm b/src/Page/Options.elm index dca2809..e7ac584 100644 --- a/src/Page/Options.elm +++ b/src/Page/Options.elm @@ -17,6 +17,8 @@ import Html , div , dl , dt + , li + , p , pre , span , table @@ -26,6 +28,7 @@ import Html , th , thead , tr + , ul ) import Html.Attributes exposing @@ -42,6 +45,7 @@ import Html.Parser.Util import Http import Json.Decode import Json.Encode +import Regex import Search @@ -145,10 +149,30 @@ viewResultItem show item = else [] in - tr [ onClick (SearchMsg (Search.ShowDetails item.source.name)) ] - [ td [] [ text item.source.name ] - ] - :: packageDetails + [] + -- DEBUG: |> List.append + -- DEBUG: [ tr [] + -- DEBUG: [ td [ colspan 1 ] + -- DEBUG: [ p [] [ text <| "score: " ++ String.fromFloat item.score ] + -- DEBUG: , p [] + -- DEBUG: [ text <| + -- DEBUG: "matched queries: " + -- DEBUG: , ul [] + -- DEBUG: (item.matched_queries + -- DEBUG: |> Maybe.withDefault [] + -- DEBUG: |> List.sort + -- DEBUG: |> List.map (\q -> li [] [ text q ]) + -- DEBUG: ) + -- DEBUG: ] + -- DEBUG: ] + -- DEBUG: ] + -- DEBUG: ] + |> List.append + (tr [ onClick (SearchMsg (Search.ShowDetails item.source.name)) ] + [ td [] [ text item.source.name ] + ] + :: packageDetails + ) viewResultItemDetails : @@ -229,160 +253,6 @@ viewResultItemDetails item = -- API -makeRequestBody : - String - -> Int - -> Int - -> Http.Body -makeRequestBody query from size = - -- Prefix Query - -- example query for "python" - -- { - -- "from": 0, - -- "size": 1000, - -- "query": { - -- "bool": { - -- "must": { - -- "bool": { - -- "should": [ - -- { - -- "term": { - -- "option_name.raw": { - -- "value": "nginx", - -- "boost": 2.0 - -- } - -- } - -- }, - -- { - -- "term": { - -- "option_name": { - -- "value": "nginx", - -- "boost": 1.0 - -- } - -- } - -- }, - -- { - -- "term": { - -- "option_name.granular": { - -- "value": "nginx", - -- "boost": 0.6 - -- } - -- } - -- }, - -- { - -- "term": { - -- "option_description": { - -- "value": "nginx", - -- "boost": 0.3 - -- } - -- } - -- } - -- ] - -- } - -- }, - -- "filter": [ - -- { - -- "match": { - -- "type": "option" - -- } - -- } - -- ] - -- } - -- }, - -- "rescore" : { - -- "window_size": 500, - -- "query" : { - -- "score_mode": "total", - -- "rescore_query" : { - -- "function_score" : { - -- "script_score": { - -- "script": { - -- "source": " - -- int i = 1; - -- for (token in doc['option_name.raw'][0].splitOnToken('.')) { - -- if (token == \"nginx\") { - -- return 10000 - (i * 100); - -- } - -- i++; - -- } - -- return 10; - -- " - -- } - -- } - -- } - -- } - -- } - -- } - -- } - let - stringIn name value = - [ ( name, Json.Encode.string value ) ] - - listIn name type_ value = - [ ( name, Json.Encode.list type_ value ) ] - - objectIn name value = - [ ( name, Json.Encode.object value ) ] - - encodeTerm ( name, boost ) = - [ ( "term" - , Json.Encode.object - [ ( name - , Json.Encode.object - [ ( "value", Json.Encode.string query ) - , ( "boost", Json.Encode.float boost ) - ] - ) - ] - ) - ] - in - [ ( "option_name.raw", 2.0 ) - , ( "option_name", 1.0 ) - , ( "option_name.granular", 0.6 ) - , ( "option_description", 0.3 ) - ] - |> List.map encodeTerm - |> listIn "should" Json.Encode.object - |> objectIn "bool" - |> objectIn "must" - |> ([ ( "type", Json.Encode.string "option" ) ] - |> objectIn "match" - |> objectIn "filter" - |> List.append - ) - |> objectIn "bool" - |> objectIn "query" - |> List.append - ("""int i = 1; - for (token in doc['option_name.raw'][0].splitOnToken('.')) { - if (token == '""" - ++ query - ++ """') { - return 10000 - (i * 100); - } - i++; - } - return 10; - """ - |> stringIn "source" - |> objectIn "script" - |> objectIn "script_score" - |> objectIn "function_score" - |> objectIn "rescore_query" - |> List.append ("total" |> stringIn "score_mode") - |> objectIn "query" - |> List.append [ ( "window_size", Json.Encode.int 1000 ) ] - |> objectIn "rescore" - ) - |> List.append - [ ( "from", Json.Encode.int from ) - , ( "size", Json.Encode.int size ) - ] - |> Json.Encode.object - |> Http.jsonBody - - makeRequest : Search.Options -> String @@ -390,9 +260,123 @@ makeRequest : -> Int -> Int -> Cmd Msg -makeRequest options channel query from size = +makeRequest options channel queryRaw from size = + let + query = + queryRaw + |> String.trim + + delimiters = + Maybe.withDefault Regex.never (Regex.fromString "[. ]") + + should_match boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "match" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "query", Json.Encode.string query ) + , ( "boost", Json.Encode.float boost ) + , ( "analyzer", Json.Encode.string "whitespace" ) + , ( "fuzziness", Json.Encode.string "1" ) + , ( "_name" + , Json.Encode.string <| + "should_match_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) + ] + ) + [ ( "option_name", 1 ) + , ( "option_name_query", 1 ) + , ( "option_description", 1 ) + ] + + should_match_bool_prefix boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "match_bool_prefix" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "query", Json.Encode.string query ) + , ( "boost", Json.Encode.float boost ) + , ( "analyzer", Json.Encode.string "whitespace" ) + , ( "fuzziness", Json.Encode.string "1" ) + , ( "_name" + , Json.Encode.string <| + "should_match_bool_prefix_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) + ] + ) + [ ( "option_name", 1 ) + , ( "option_name_query", 1 ) + ] + + should_terms boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "terms" + , Json.Encode.object + [ ( field + , Json.Encode.list Json.Encode.string (Regex.split delimiters query) + ) + , ( "boost", Json.Encode.float <| boost_base * boost ) + , ( "_name" + , Json.Encode.string <| + "should_terms_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) + [ ( "option_name", 1 ) + , ( "option_name_query", 1 ) + ] + + should_term boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "term" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "value", Json.Encode.string query ) + , ( "boost", Json.Encode.float <| boost_base * boost ) + , ( "_name" + , Json.Encode.string <| + "should_term_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) + ] + ) + [ ( "option_name", 1 ) + , ( "option_name_query", 1 ) + ] + + should_queries = + [] + |> List.append (should_term 10000) + |> List.append (should_terms 1000) + |> List.append (should_match_bool_prefix 100) + |> List.append (should_match 10) + in Search.makeRequest - makeRequestBody + (Search.makeRequestBody query from size "option" "option_name_query" should_queries) ("latest-" ++ String.fromInt options.mappingSchemaVersion ++ "-" ++ channel) decodeResultItemSource options diff --git a/src/Page/Packages.elm b/src/Page/Packages.elm index 93c665e..7dcc50b 100644 --- a/src/Page/Packages.elm +++ b/src/Page/Packages.elm @@ -362,12 +362,14 @@ viewResultItemDetails channel item = -- API -makeRequestBody : - String +makeRequest : + Search.Options + -> String + -> String -> Int -> Int - -> Http.Body -makeRequestBody queryRaw from size = + -> Cmd Msg +makeRequest options channel queryRaw from size = let query = queryRaw @@ -481,130 +483,15 @@ makeRequestBody queryRaw from size = , ( "package_pname", 1 ) ] - filter_packages = - ( "term" - , Json.Encode.object - [ ( "type" - , Json.Encode.object - [ ( "value", Json.Encode.string "package" ) - , ( "_name", Json.Encode.string "filter_packages" ) - ] - ) - ] - ) - - filter_queries = - let - filterQuery = - query - |> String.replace "." " " - in - filterQuery - |> String.words - |> List.indexedMap - (\i query_word -> - let - isLast = - List.length (String.words filterQuery) == i + 1 - in - [ if isLast then - ( "bool" - , Json.Encode.object - [ ( "should" - , Json.Encode.list Json.Encode.object - [ [ ( "match" - , Json.Encode.object - [ ( "package_attr_name_query" - , Json.Encode.object - [ ( "query", Json.Encode.string query_word ) - , ( "fuzziness", Json.Encode.string "1" ) - , ( "_name", Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_match" ) - ] - ) - ] - ) - ] - , [ ( "match_bool_prefix" - , Json.Encode.object - [ ( "package_attr_name_query" - , Json.Encode.object - [ ( "query", Json.Encode.string query_word ) - , ( "_name" - , Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_prefix" - ) - ] - ) - ] - ) - ] - ] - ) - ] - ) - - else - ( "match_bool_prefix" - , Json.Encode.object - [ ( "package_attr_name_query" - , Json.Encode.object - [ ( "query", Json.Encode.string query_word ) - , ( "_name" - , Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_prefix" - ) - ] - ) - ] - ) - ] - ) + should_queries = + [] + |> List.append (should_term 10000) + |> List.append (should_terms 1000) + |> List.append (should_match_bool_prefix 100) + |> List.append (should_match 10) in - Http.jsonBody - (Json.Encode.object - [ ( "from" - , Json.Encode.int from - ) - , ( "size" - , Json.Encode.int size - ) - , ( "query" - , Json.Encode.object - [ ( "bool" - , Json.Encode.object - [ ( "filter" - , Json.Encode.list Json.Encode.object - (List.append - [ [ filter_packages ] ] - filter_queries - ) - ) - , ( "should" - , Json.Encode.list - Json.Encode.object - ([] - |> List.append (should_term 10000) - |> List.append (should_terms 1000) - |> List.append (should_match_bool_prefix 100) - |> List.append (should_match 10) - ) - ) - ] - ) - ] - ) - ] - ) - - -makeRequest : - Search.Options - -> String - -> String - -> Int - -> Int - -> Cmd Msg -makeRequest options channel query from size = Search.makeRequest - makeRequestBody + (Search.makeRequestBody query from size "package" "package_attr_name_query" should_queries) ("latest-" ++ String.fromInt options.mappingSchemaVersion ++ "-" ++ channel) decodeResultItemSource options diff --git a/src/Search.elm b/src/Search.elm index ba446f0..7f4160c 100644 --- a/src/Search.elm +++ b/src/Search.elm @@ -8,6 +8,7 @@ module Search exposing , decodeResult , init , makeRequest + , makeRequestBody , update , view ) @@ -559,8 +560,161 @@ type alias Options = } +filter_by_type : + String + -> ( String, Json.Encode.Value ) +filter_by_type type_ = + ( "term" + , Json.Encode.object + [ ( "type" + , Json.Encode.object + [ ( "value", Json.Encode.string type_ ) + , ( "_name", Json.Encode.string <| "filter_" ++ type_ ++ "s" ) + ] + ) + ] + ) + + +filter_by_query : String -> String -> List (List ( String, Json.Encode.Value )) +filter_by_query field queryRaw = + let + query = + queryRaw + |> String.trim + in + query + |> String.replace "." " " + |> String.words + |> List.indexedMap + (\i query_word -> + let + isLast = + List.length (String.words query) == i + 1 + in + [ if isLast then + ( "bool" + , Json.Encode.object + [ ( "should" + , Json.Encode.list Json.Encode.object + [ [ ( "match" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "query", Json.Encode.string query_word ) + , ( "fuzziness", Json.Encode.string "1" ) + , ( "_name", Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_match" ) + ] + ) + ] + ) + ] + , [ ( "match_bool_prefix" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "query", Json.Encode.string query_word ) + , ( "_name" + , Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_prefix" + ) + ] + ) + ] + ) + ] + ] + ) + ] + ) + + else + ( "match_bool_prefix" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "query", Json.Encode.string query_word ) + , ( "_name" + , Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_prefix" + ) + ] + ) + ] + ) + ] + ) + + +makeRequestBody : + String + -> Int + -> Int + -> String + -> String + -> List (List ( String, Json.Encode.Value )) + -> Http.Body +makeRequestBody query from size type_ query_field should_queries = + -- TODO: rescore how close the query is to the root of the name + -- |> List.append + -- ("""int i = 1; + -- for (token in doc['option_name.raw'][0].splitOnToken('.')) { + -- if (token == '""" + -- ++ query + -- ++ """') { + -- return 10000 - (i * 100); + -- } + -- i++; + -- } + -- return 10; + -- """ + -- |> stringIn "source" + -- |> objectIn "script" + -- |> objectIn "script_score" + -- |> objectIn "function_score" + -- |> objectIn "rescore_query" + -- |> List.append ("total" |> stringIn "score_mode") + -- |> List.append ("total" |> stringIn "score_mode") + -- |> objectIn "query" + -- |> List.append [ ( "window_size", Json.Encode.int 1000 ) ] + -- |> objectIn "rescore" + -- ) + -- |> List.append + -- [ ( "from", Json.Encode.int from ) + -- , ( "size", Json.Encode.int size ) + -- ] + -- |> Json.Encode.object + -- |> Http.jsonBody + Http.jsonBody + (Json.Encode.object + [ ( "from" + , Json.Encode.int from + ) + , ( "size" + , Json.Encode.int size + ) + , ( "query" + , Json.Encode.object + [ ( "bool" + , Json.Encode.object + [ ( "filter" + , Json.Encode.list Json.Encode.object + (List.append + [ [ filter_by_type type_ ] ] + (filter_by_query query_field query) + ) + ) + , ( "should" + , Json.Encode.list Json.Encode.object should_queries + ) + ] + ) + ] + ) + ] + ) + + makeRequest : - (String -> Int -> Int -> Http.Body) + Http.Body -> String -> Json.Decode.Decoder a -> Options @@ -568,7 +722,7 @@ makeRequest : -> Int -> Int -> Cmd (Msg a) -makeRequest makeRequestBody index decodeResultItemSource options query from sizeRaw = +makeRequest body index decodeResultItemSource options query from sizeRaw = let -- you can not request more then 10000 results otherwise it will return 404 size = @@ -584,7 +738,7 @@ makeRequest makeRequestBody index decodeResultItemSource options query from size [ Http.header "Authorization" ("Basic " ++ Base64.encode (options.username ++ ":" ++ options.password)) ] , url = options.url ++ "/" ++ index ++ "/_search" - , body = makeRequestBody query from size + , body = body , expect = Http.expectJson (RemoteData.fromResult >> QueryResponse) diff --git a/src/index.js b/src/index.js index 58baa3c..c2aa0d1 100644 --- a/src/index.js +++ b/src/index.js @@ -4,9 +4,10 @@ require("./index.scss"); const {Elm} = require('./Main'); +console.log("WORKS: " + process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION); Elm.Main.init({ flags: { - elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 0, + elasticsearchMappingSchemaVersion: parseInt(process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION) || 0, elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443', elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR', elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG' diff --git a/webpack.config.js b/webpack.config.js index e87f41c..a0a6a7a 100644 --- a/webpack.config.js +++ b/webpack.config.js @@ -26,6 +26,9 @@ var common = { filename: MODE == "production" ? "[name]-[hash].js" : "index.js" }, plugins: [ + new webpack.EnvironmentPlugin([ + "ELASTICSEARCH_MAPPING_SCHEMA_VERSION" + ]), new HTMLWebpackPlugin({ // Use this template to get basic responsive meta tags template: "src/index.html",