improve packages search query (#102)

This commit is contained in:
Rok Garbas 2020-06-18 12:24:52 +02:00 committed by GitHub
parent 0fb5f699b9
commit c420d05815
Failed to generate hash of commit
8 changed files with 458 additions and 157 deletions

1
.gitignore vendored
View file

@ -31,3 +31,4 @@ dist
package-lock.json package-lock.json
result result
scripts/eval-* scripts/eval-*
eval-*

View file

@ -18,6 +18,24 @@ For backend we are using Elasticsearch instance which is kindly sponsored by
[Elm](https://elm-lang.org). [Elm](https://elm-lang.org).
## How search works?
The use case we want to solve is that a visitor wants to see if a package
exists or to look up a certain package's details.
A user wants to converge on a single result if possible. The more characters
are added to a search query, the narrower the search becomes, and we should
show fewer results.
Ranking of search results is also very important. It brings the more relevant
search results to the top, since it is often hard to produce a search query
that outputs only one result item.
Less important, but providing a better user experience, are suggestions for
writing a better search query. The suggestion feature should guide the user
to write better queries, which in turn will produce better results.
## Ideas we want to explore ## Ideas we want to explore
Apart from searching packages and options we would like to: Apart from searching packages and options we would like to:

View file

@ -12,6 +12,7 @@
"elm/html": "1.0.0", "elm/html": "1.0.0",
"elm/http": "2.0.0", "elm/http": "2.0.0",
"elm/json": "1.1.3", "elm/json": "1.1.3",
"elm/regex": "1.0.0",
"elm/url": "1.0.0", "elm/url": "1.0.0",
"hecrj/html-parser": "2.3.4", "hecrj/html-parser": "2.3.4",
"krisajenkins/remotedata": "6.0.1", "krisajenkins/remotedata": "6.0.1",
@ -21,7 +22,6 @@
"elm/bytes": "1.0.8", "elm/bytes": "1.0.8",
"elm/file": "1.0.5", "elm/file": "1.0.5",
"elm/parser": "1.1.0", "elm/parser": "1.1.0",
"elm/regex": "1.0.0",
"elm/time": "1.0.0", "elm/time": "1.0.0",
"elm/virtual-dom": "1.0.2", "elm/virtual-dom": "1.0.2",
"rtfeldman/elm-hex": "1.0.0" "rtfeldman/elm-hex": "1.0.0"

View file

@ -13,7 +13,6 @@
import boto3 import boto3
import botocore import botocore
import botocore.client import botocore.client
import xml.etree.ElementTree
import click import click
import click_log import click_log
import elasticsearch import elasticsearch
@ -22,10 +21,12 @@ import json
import logging import logging
import os.path import os.path
import pypandoc import pypandoc
import re
import requests import requests
import shlex import shlex
import subprocess import subprocess
import tqdm import tqdm
import xml.etree.ElementTree
logger = logging.getLogger("import-channel") logger = logging.getLogger("import-channel")
click_log.basic_config(logger) click_log.basic_config(logger)
@ -33,7 +34,7 @@ click_log.basic_config(logger)
S3_BUCKET = "nix-releases" S3_BUCKET = "nix-releases"
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
INDEX_SCHEMA_VERSION = 5 INDEX_SCHEMA_VERSION = 6
CHANNELS = { CHANNELS = {
"unstable": { "unstable": {
"packages": "nixpkgs/nixpkgs-20.09pre", "packages": "nixpkgs/nixpkgs-20.09pre",
@ -49,11 +50,18 @@ CHANNELS = {
}, },
} }
ANALYSIS = { ANALYSIS = {
"analyzer": { "normalizer": {
"nixAttrName": { "lowercase": {
"type": "custom", "type": "custom",
"tokenizer": "nix_attrname", "char_filter": [],
"filter": ["lowercase", "nix_stopwords"], "filter": ["lowercase"],
}
},
"analyzer": {
"lowercase": {
"type": "custom",
"tokenizer": "keyword",
"filter": ["lowercase"],
}, },
"nixOptionName": { "nixOptionName": {
"type": "custom", "type": "custom",
@ -67,13 +75,24 @@ ANALYSIS = {
}, },
}, },
"tokenizer": { "tokenizer": {
"nix_attrname": { "nix_package_query": {
"type": "pattern",
"pattern": "|".join(
[
"[ ]",
]
),
},
"nix_package_attr_name": {
"type": "pattern", "type": "pattern",
# Split on attrname separators like _, . # Split on attrname separators like _, .
"pattern": "|".join( "pattern": "|".join(
[ [
"[_.-]", # Common separators like underscores, dots and dashes "[_.-]", # Common separators like underscores, dots and dashes
"\\d+?Packages", # python37Packages -> python "\\d+?Packages", # python37Packages -> python
"\\d+?Plugins", # vimPlugins -> vim
"\\d+?Extensions", # php74Extensions -> php
"\\d+?Interpreters", # perlInterpreters -> perl
# Camelcase tokenizer adapted from # Camelcase tokenizer adapted from
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
"".join( "".join(
@ -118,7 +137,18 @@ ANALYSIS = {
"nix_stopwords": { "nix_stopwords": {
"type": "stop", "type": "stop",
"ignore_case": True, "ignore_case": True,
"stopwords": ["packages", "package", "options", "option"], "stopwords": [
"packages",
"package",
"options",
"option",
"plugins",
"plugin",
"extensions",
"extension",
"interpreters",
"interpreter",
],
}, },
}, },
} }
@ -146,12 +176,21 @@ MAPPING = {
}, },
}, },
"package_attr_name": { "package_attr_name": {
"type": "text", "type": "keyword",
"analyzer": "nixAttrName", "normalizer": "lowercase",
"fields": {"raw": {"type": "keyword"}}, },
"package_attr_name_query": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_set": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_pname": {
"type": "keyword",
"normalizer": "lowercase",
}, },
"package_attr_set": {"type": "keyword"},
"package_pname": {"type": "keyword"},
"package_pversion": {"type": "keyword"}, "package_pversion": {"type": "keyword"},
"package_description": {"type": "text"}, "package_description": {"type": "text"},
"package_longDescription": {"type": "text"}, "package_longDescription": {"type": "text"},
@ -195,6 +234,39 @@ MAPPING = {
} }
def split_query(text):
    """Tokenize a package attr_name into progressively longer search tokens.

    The attr_name is first split into parts on common separators
    (``_``, ``.``, ``-``), on lowercase->digit/uppercase boundaries and on
    digit/uppercase->CamelCase boundaries.  For every suffix of the parts
    list we then emit the cumulative joins of its parts, with trailing
    separators stripped.  For ``python37Packages.test1_name-test2`` this
    yields (among others):

        = index: 0
          - python37Packages.test1_name-test2
          - python37Packages.test1_name
          - python37Packages.test1
          - python37
          - python
        = index: 1
          - test1_name-test2
          - test1_name
          - test1
        = index: 2
          - name-test2
          - name
        = index: 3
          - test2
    """
    # Raw string: "\." inside a plain string literal is an invalid escape
    # sequence (SyntaxWarning/DeprecationWarning on newer Pythons).
    regex = re.compile(
        r".+?(?:(?<=[a-z])(?=[1-9A-Z])|(?<=[1-9A-Z])(?=[A-Z][a-z])|[\._-]|$)"
    )
    parts = [m.group(0) for m in regex.finditer(text)]
    tokens = []
    for index in range(len(parts)):
        prev_parts = ""
        for part in parts[index:]:
            # Each part keeps its trailing separator; strip it from the
            # emitted token but keep it in the running prefix.
            tokens.append((prev_parts + part).rstrip("_.-"))
            prev_parts += part
    return tokens
def get_last_evaluation(prefix): def get_last_evaluation(prefix):
logger.debug(f"Retriving last evaluation for {prefix} prefix.") logger.debug(f"Retriving last evaluation for {prefix} prefix.")
@ -265,6 +337,63 @@ def get_evaluation_builds(evaluation_id):
return result return result
def get_maintainer(maintainer):
    """Normalize a nixpkgs ``meta.maintainers`` entry.

    ``maintainer`` may be a plain name (str), a dict with ``name``/``email``/
    ``github`` keys, or a (possibly nested) list of either.  Always returns a
    flat list of dicts with the keys ``name``, ``email`` and ``github``
    (missing fields are ``None``).  Exits the process on any other type.
    """
    maintainers = []
    if isinstance(maintainer, str):
        maintainers.append(
            dict(
                name=maintainer,
                email=None,
                github=None,
            )
        )
    elif isinstance(maintainer, dict):
        maintainers.append(
            dict(
                name=maintainer.get("name"),
                email=maintainer.get("email"),
                github=maintainer.get("github"),
            )
        )
    elif isinstance(maintainer, list):
        for item in maintainer:
            maintainers += get_maintainer(item)
    else:
        # The original called sys.exit(1) without `sys` being imported at
        # module level, which would raise NameError instead of exiting.
        import sys

        logger.error(f"maintainer can not be recognized from: {maintainer}")
        sys.exit(1)
    return maintainers
def remove_attr_set(name):
    """Strip a leading package-set prefix from a pname.

    For some package sets the set prefix is included in the pname
    (e.g. ``python-requests`` -> ``requests``); node packages use a
    ``node_`` prefix instead of a dash separator.
    """
    sets = [
        # Packages
        "emscripten",
        "lua",
        "php",
        "pure",
        "python",
        "lisp",
        "perl",
        "ruby",
        # Plugins
        "elasticsearch",
        "graylog",
        # BUG FIX: the original list was missing commas here, which silently
        # concatenated these two into the single string "tmuxpluginvimplugin",
        # so neither prefix was ever stripped.
        "tmuxplugin",
        "vimplugin",
    ]
    # TODO: is this correct
    if any(name.startswith(i) for i in sets):
        name = "-".join(name.split("-")[1:])
    # node does things a bit different
    elif name.startswith("node_"):
        name = name[len("node_"):]
    return name
def get_packages(evaluation, evaluation_builds): def get_packages(evaluation, evaluation_builds):
logger.debug( logger.debug(
f"get_packages: Retriving list of packages for '{evaluation['git_revision']}' revision" f"get_packages: Retriving list of packages for '{evaluation['git_revision']}' revision"
@ -281,6 +410,7 @@ def get_packages(evaluation, evaluation_builds):
def gen(): def gen():
for attr_name, data in packages: for attr_name, data in packages:
position = data["meta"].get("position") position = data["meta"].get("position")
if position and position.startswith("/nix/store"): if position and position.startswith("/nix/store"):
position = position[44:] position = position[44:]
@ -300,16 +430,7 @@ def get_packages(evaluation, evaluation_builds):
else: else:
licenses = [] licenses = []
maintainers = [ maintainers = get_maintainer(data["meta"].get("maintainers", []))
type(maintainer) == str
and dict(name=maintainer, email=None, github=None)
or dict(
name=maintainer.get("name"),
email=maintainer.get("email"),
github=maintainer.get("github"),
)
for maintainer in data["meta"].get("maintainers", [])
]
platforms = [ platforms = [
type(platform) == str and platform or None type(platform) == str and platform or None
@ -319,9 +440,9 @@ def get_packages(evaluation, evaluation_builds):
attr_set = None attr_set = None
if "." in attr_name: if "." in attr_name:
attr_set = attr_name.split(".")[0] attr_set = attr_name.split(".")[0]
if not attr_set.endswith("Packages") and not attr_set.endswith( if not attr_set.endswith("Packages") and \
"Plugins" not attr_set.endswith("Plugins") and \
): not attr_set.endswith("Extensions"):
attr_set = None attr_set = None
hydra = None hydra = None
@ -349,8 +470,9 @@ def get_packages(evaluation, evaluation_builds):
type="package", type="package",
package_hydra=hydra, package_hydra=hydra,
package_attr_name=attr_name, package_attr_name=attr_name,
package_attr_name_query=list(split_query(attr_name)),
package_attr_set=attr_set, package_attr_set=attr_set,
package_pname=data["pname"], package_pname=remove_attr_set(data["pname"]),
package_pversion=data["version"], package_pversion=data["version"],
package_description=data["meta"].get("description"), package_description=data["meta"].get("description"),
package_longDescription=data["meta"].get("longDescription", ""), package_longDescription=data["meta"].get("longDescription", ""),
@ -405,7 +527,7 @@ def get_options(evaluation):
# we first check if there are some xml elements before using pypandoc # we first check if there are some xml elements before using pypandoc
# since pypandoc calls are quite slow # since pypandoc calls are quite slow
root = xml.etree.ElementTree.fromstring(xml_description) root = xml.etree.ElementTree.fromstring(xml_description)
if len(root.find('para').getchildren()) > 0: if len(list(root.find('para'))) > 0:
description = pypandoc.convert_text( description = pypandoc.convert_text(
xml_description, xml_description,
"html", "html",

View file

@ -5,8 +5,38 @@
# Enable recursion into attribute sets that nix-env normally doesn't look into # Enable recursion into attribute sets that nix-env normally doesn't look into
# so that we can get a more complete picture of the available packages for the # so that we can get a more complete picture of the available packages for the
# purposes of the index. # purposes of the index.
packageOverrides = super: { packageOverrides = super:
haskellPackages = super.recurseIntoAttrs super.haskellPackages; let
rPackages = super.recurseIntoAttrs super.rPackages; recurseIntoAttrs = sets:
}; super.lib.genAttrs
(builtins.filter (set: builtins.hasAttr set super) sets)
(set: super.recurseIntoAttrs (builtins.getAttr set super));
in recurseIntoAttrs [
"roundcubePlugins"
"emscriptenfastcompPackages"
"fdbPackages"
"nodePackages_latest"
"nodePackages"
"platformioPackages"
"haskellPackages"
"idrisPackages"
"sconsPackages"
"gns3Packages"
"quicklispPackagesClisp"
"quicklispPackagesSBCL"
"rPackages"
"apacheHttpdPackages_2_4"
"zabbix44"
"zabbix40"
"zabbix30"
"fusePackages"
"nvidiaPackages"
"sourceHanPackages"
"atomPackages"
"emacs25Packages"
"emacs26Packages"
"steamPackages"
"ut2004Packages"
"zeroadPackages"
];
} }

View file

@ -19,6 +19,7 @@ import Html
, dl , dl
, dt , dt
, li , li
, p
, table , table
, tbody , tbody
, td , td
@ -42,6 +43,7 @@ import Http
import Json.Decode import Json.Decode
import Json.Decode.Pipeline import Json.Decode.Pipeline
import Json.Encode import Json.Encode
import Regex
import Search import Search
@ -186,13 +188,33 @@ viewResultItem channel show item =
else else
[] []
in in
tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ] []
[ td [] [ text item.source.attr_name ] -- DEBUG: |> List.append
-- DEBUG: [ tr []
-- DEBUG: [ td [ colspan 4 ]
-- DEBUG: [ p [] [ text <| "score: " ++ String.fromFloat item.score ]
-- DEBUG: , p []
-- DEBUG: [ text <|
-- DEBUG: "matched queries: "
-- DEBUG: , ul []
-- DEBUG: (item.matched_queries
-- DEBUG: |> Maybe.withDefault []
-- DEBUG: |> List.sort
-- DEBUG: |> List.map (\q -> li [] [ text q ])
-- DEBUG: )
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
|> List.append
(tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ]
[ td [] [ text <| item.source.attr_name ]
, td [] [ text item.source.pname ] , td [] [ text item.source.pname ]
, td [] [ text item.source.pversion ] , td [] [ text item.source.pversion ]
, td [] [ text <| Maybe.withDefault "" item.source.description ] , td [] [ text <| Maybe.withDefault "" item.source.description ]
] ]
:: packageDetails :: packageDetails
)
viewResultItemDetails : viewResultItemDetails :
@ -345,126 +367,232 @@ makeRequestBody :
-> Int -> Int
-> Int -> Int
-> Http.Body -> Http.Body
makeRequestBody query from size = makeRequestBody queryRaw from size =
-- Prefix Query
-- example query for "python"
-- {
-- "from": 0,
-- "size": 10,
-- "query": {
-- "bool": {
-- "filter": {
-- "match": {
-- "type": "package"
-- }
-- },
-- "must": {
-- "bool": {
-- "should": [
-- {
-- "multi_match": {
-- "query": "python",
-- "boost": 1,
-- "fields": [
-- "package_attr_name.raw",
-- "package_attr_name"
-- ],
-- "type": "most_fields"
-- }
-- },
-- {
-- "term": {
-- "type": {
-- "value": "package",
-- "boost": 0
-- }
-- }
-- },
-- {
-- "term": {
-- "package_pname": {
-- "value": "python",
-- "boost": 2
-- }
-- }
-- },
-- {
-- "term": {
-- "package_pversion": {
-- "value": "python",
-- "boost": 0.2
-- }
-- }
-- },
-- {
-- "term": {
-- "package_description": {
-- "value": "python",
-- "boost": 0.3
-- }
-- }
-- },
-- {
-- "term": {
-- "package_longDescription": {
-- "value": "python",
-- "boost": 0.1
-- }
-- }
-- }
-- ]
-- }
-- }
-- }
-- }
-- }
let let
listIn name type_ value = query =
[ ( name, Json.Encode.list type_ value ) ] queryRaw
|> String.trim
objectIn name value = delimiters =
[ ( name, Json.Encode.object value ) ] Maybe.withDefault Regex.never (Regex.fromString "[. ]")
encodeTerm ( name, boost ) = should_match boost_base =
[ ( "value", Json.Encode.string query ) List.indexedMap
, ( "boost", Json.Encode.float boost ) (\i ( field, boost ) ->
] [ ( "match"
|> objectIn name , Json.Encode.object
|> objectIn "term" [ ( field
in , Json.Encode.object
[ ( "package_pname", 2.0 )
, ( "package_pversion", 0.2 )
, ( "package_description", 0.3 )
, ( "package_longDescription", 0.1 )
]
|> List.map encodeTerm
|> List.append
[ [ "package_attr_name.raw"
, "package_attr_name"
]
|> listIn "fields" Json.Encode.string
|> List.append
[ ( "query", Json.Encode.string query ) [ ( "query", Json.Encode.string query )
, ( "boost", Json.Encode.float 1.0 ) , ( "boost", Json.Encode.float boost )
] , ( "analyzer", Json.Encode.string "whitespace" )
|> objectIn "multi_match" , ( "fuzziness", Json.Encode.string "1" )
] , ( "_name"
|> listIn "should" Json.Encode.object , Json.Encode.string <|
|> objectIn "bool" "should_match_"
|> objectIn "must" ++ String.fromInt (i + 1)
|> ([ ( "type", Json.Encode.string "package" ) ]
|> objectIn "match"
|> objectIn "filter"
|> List.append
) )
|> objectIn "bool"
|> objectIn "query"
|> List.append
[ ( "from", Json.Encode.int from )
, ( "size", Json.Encode.int size )
] ]
|> Json.Encode.object )
|> Http.jsonBody ]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
, ( "package_description", 1 )
, ( "package_longDescription", 1 )
]
should_match_bool_prefix boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "match_bool_prefix"
, Json.Encode.object
[ ( field
, Json.Encode.object
[ ( "query", Json.Encode.string query )
, ( "boost", Json.Encode.float boost )
, ( "analyzer", Json.Encode.string "whitespace" )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name"
, Json.Encode.string <|
"should_match_bool_prefix_"
++ String.fromInt (i + 1)
)
]
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
]
should_terms boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "terms"
, Json.Encode.object
[ ( field
, Json.Encode.list Json.Encode.string (Regex.split delimiters query)
)
, ( "boost", Json.Encode.float <| boost_base * boost )
, ( "_name"
, Json.Encode.string <|
"should_terms_"
++ String.fromInt (i + 1)
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
, ( "package_attr_set", 1 )
]
should_term boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "term"
, Json.Encode.object
[ ( field
, Json.Encode.object
[ ( "value", Json.Encode.string query )
, ( "boost", Json.Encode.float <| boost_base * boost )
, ( "_name"
, Json.Encode.string <|
"should_term_"
++ String.fromInt (i + 1)
)
]
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
]
filter_packages =
( "term"
, Json.Encode.object
[ ( "type"
, Json.Encode.object
[ ( "value", Json.Encode.string "package" )
, ( "_name", Json.Encode.string "filter_packages" )
]
)
]
)
filter_queries =
let
filterQuery =
query
|> String.replace "." " "
in
filterQuery
|> String.words
|> List.indexedMap
(\i query_word ->
let
isLast =
List.length (String.words filterQuery) == i + 1
in
[ if isLast then
( "bool"
, Json.Encode.object
[ ( "should"
, Json.Encode.list Json.Encode.object
[ [ ( "match"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name", Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_match" )
]
)
]
)
]
, [ ( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_prefix"
)
]
)
]
)
]
]
)
]
)
else
( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_prefix"
)
]
)
]
)
]
)
in
Http.jsonBody
(Json.Encode.object
[ ( "from"
, Json.Encode.int from
)
, ( "size"
, Json.Encode.int size
)
, ( "query"
, Json.Encode.object
[ ( "bool"
, Json.Encode.object
[ ( "filter"
, Json.Encode.list Json.Encode.object
(List.append
[ [ filter_packages ] ]
filter_queries
)
)
, ( "should"
, Json.Encode.list
Json.Encode.object
([]
|> List.append (should_term 10000)
|> List.append (should_terms 1000)
|> List.append (should_match_bool_prefix 100)
|> List.append (should_match 10)
)
)
]
)
]
)
]
)
makeRequest : makeRequest :

View file

@ -91,6 +91,7 @@ type alias ResultItem a =
, id : String , id : String
, score : Float , score : Float
, source : a , source : a
, matched_queries : Maybe (List String)
} }
@ -622,8 +623,9 @@ decodeResultHitsTotal =
decodeResultItem : Json.Decode.Decoder a -> Json.Decode.Decoder (ResultItem a) decodeResultItem : Json.Decode.Decoder a -> Json.Decode.Decoder (ResultItem a)
decodeResultItem decodeResultItemSource = decodeResultItem decodeResultItemSource =
Json.Decode.map4 ResultItem Json.Decode.map5 ResultItem
(Json.Decode.field "_index" Json.Decode.string) (Json.Decode.field "_index" Json.Decode.string)
(Json.Decode.field "_id" Json.Decode.string) (Json.Decode.field "_id" Json.Decode.string)
(Json.Decode.field "_score" Json.Decode.float) (Json.Decode.field "_score" Json.Decode.float)
(Json.Decode.field "_source" decodeResultItemSource) (Json.Decode.field "_source" decodeResultItemSource)
(Json.Decode.maybe (Json.Decode.field "matched_queries" (Json.Decode.list Json.Decode.string)))

View file

@ -6,7 +6,7 @@ const {Elm} = require('./Main');
Elm.Main.init({ Elm.Main.init({
flags: { flags: {
elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 5, elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 6,
elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443', elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443',
elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR', elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR',
elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG' elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG'