From c420d058156320d7bd80194cac1a0770fd10f10d Mon Sep 17 00:00:00 2001
From: Rok Garbas
Date: Thu, 18 Jun 2020 12:24:52 +0200
Subject: [PATCH] improve packages search query (#102)

---
 .gitignore                  |   1 +
 README.md                   |  18 ++
 elm.json                    |   2 +-
 scripts/import-channel      | 178 ++++++++++++++---
 scripts/packages-config.nix |  38 +++-
 src/Page/Packages.elm       | 372 ++++++++++++++++++++++++------------
 src/Search.elm              |   4 +-
 src/index.js                |   2 +-
 8 files changed, 458 insertions(+), 157 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3effa1f..a3070a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,4 @@ dist
 package-lock.json
 result
 scripts/eval-*
+eval-*
diff --git a/README.md b/README.md
index 0799f1d..097e9df 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,24 @@ For backend we are using Elasticsearch instance which is kindly sponsored by
 [Elm](https://elm-lang.org).
 
+## How search works
+
+The use case we want to support is a visitor who wants to see whether a
+package exists or to look up a certain package's details.
+
+A user wants to converge on a single result if possible. The more characters
+are added to the search query, the narrower the search becomes and the fewer
+results we should show.
+
+Ranking of search results is also very important. It brings the more relevant
+results to the top, since it is often hard to write a search query that
+returns only one result item.
+
+Less important, but still improving the user experience, are suggestions for
+writing a better search query. The suggestion feature should guide users to
+write better queries, which in turn will produce better results.
+
+
 ## Ideas we want to explore
 
 Apart from searching packages and options we would like to:
diff --git a/elm.json b/elm.json
index 92ec5df..db9d1a3 100644
--- a/elm.json
+++ b/elm.json
@@ -12,6 +12,7 @@
         "elm/html": "1.0.0",
         "elm/http": "2.0.0",
         "elm/json": "1.1.3",
+        "elm/regex": "1.0.0",
         "elm/url": "1.0.0",
         "hecrj/html-parser": "2.3.4",
         "krisajenkins/remotedata": "6.0.1",
@@ -21,7 +22,6 @@
         "elm/bytes": "1.0.8",
         "elm/file": "1.0.5",
         "elm/parser": "1.1.0",
-        "elm/regex": "1.0.0",
         "elm/time": "1.0.0",
         "elm/virtual-dom": "1.0.2",
         "rtfeldman/elm-hex": "1.0.0"
diff --git a/scripts/import-channel b/scripts/import-channel
index 36a6bc2..7dbc702 100755
--- a/scripts/import-channel
+++ b/scripts/import-channel
@@ -13,7 +13,6 @@
 import boto3
 import botocore
 import botocore.client
-import xml.etree.ElementTree
 import click
 import click_log
 import elasticsearch
@@ -22,10 +21,12 @@ import json
 import logging
 import os.path
 import pypandoc
+import re
 import requests
 import shlex
 import subprocess
 import tqdm
+import xml.etree.ElementTree
 
 logger = logging.getLogger("import-channel")
 click_log.basic_config(logger)
@@ -33,7 +34,7 @@ click_log.basic_config(logger)
 
 S3_BUCKET = "nix-releases"
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-INDEX_SCHEMA_VERSION = 5
+INDEX_SCHEMA_VERSION = 6
 CHANNELS = {
     "unstable": {
         "packages": "nixpkgs/nixpkgs-20.09pre",
     },
 }
 ANALYSIS = {
-    "analyzer": {
-        "nixAttrName": {
+    "normalizer": {
+        "lowercase": {
             "type": "custom",
-            "tokenizer": "nix_attrname",
-            "filter": ["lowercase", "nix_stopwords"],
+            "char_filter": [],
+            "filter": ["lowercase"],
+        }
+    },
+    "analyzer": {
+        "lowercase": {
+            "type": "custom",
+            "tokenizer": "keyword",
+            "filter": ["lowercase"],
         },
         "nixOptionName": {
             "type": "custom",
             "tokenizer": "nix_option_name",
             "filter": ["lowercase"],
         },
     },
     "tokenizer": {
-        "nix_attrname": {
+        "nix_package_query": {
+            "type": "pattern",
+            "pattern": "|".join(
+                [
+                    "[ ]",
+ ] + ), + }, + "nix_package_attr_name": { "type": "pattern", # Split on attrname separators like _, . "pattern": "|".join( [ "[_.-]", # Common separators like underscores, dots and dashes "\\d+?Packages", # python37Packages -> python + "\\d+?Plugins", # vimPlugins -> vim + "\\d+?Extensions", # php74Extensions -> php + "\\d+?Interpreters", # perlInterpreters -> perl # Camelcase tokenizer adapted from # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html "".join( @@ -118,7 +137,18 @@ ANALYSIS = { "nix_stopwords": { "type": "stop", "ignore_case": True, - "stopwords": ["packages", "package", "options", "option"], + "stopwords": [ + "packages", + "package", + "options", + "option", + "plugins", + "plugin", + "extensions", + "extension", + "interpreters", + "interpreter", + ], }, }, } @@ -146,12 +176,21 @@ MAPPING = { }, }, "package_attr_name": { - "type": "text", - "analyzer": "nixAttrName", - "fields": {"raw": {"type": "keyword"}}, + "type": "keyword", + "normalizer": "lowercase", + }, + "package_attr_name_query": { + "type": "keyword", + "normalizer": "lowercase", + }, + "package_attr_set": { + "type": "keyword", + "normalizer": "lowercase", + }, + "package_pname": { + "type": "keyword", + "normalizer": "lowercase", }, - "package_attr_set": {"type": "keyword"}, - "package_pname": {"type": "keyword"}, "package_pversion": {"type": "keyword"}, "package_description": {"type": "text"}, "package_longDescription": {"type": "text"}, @@ -195,6 +234,39 @@ MAPPING = { } +def split_query(text): + """Tokenize package attr_name + + Example: + + python37Packages.test_name-test + = index: 0 + - python37Packages.test1_name-test2 + - python37Packages.test1_name + - python37Packages.test1 + - python37 + - python + = index: 1 + - test1_name-test2 + - test1_name + - test1 + = index: 2 + - name-test2 + - name + = index: 3 + - test2 + """ + tokens = [] + regex = re.compile(".+?(?:(?<=[a-z])(?=[1-9A-Z])|(?<=[1-9A-Z])(?=[A-Z][a-z])|[\._-]|$)") + parts = [m.group(0) for m in regex.finditer(text)] + for index in range(len(parts)): + prev_parts = "" + for part in parts[index:]: + tokens.append((prev_parts + part).rstrip("_.-")) + prev_parts += part + return tokens + + def get_last_evaluation(prefix): logger.debug(f"Retriving last evaluation for {prefix} prefix.") @@ -265,6 +337,63 @@ def get_evaluation_builds(evaluation_id): return result +def get_maintainer(maintainer): + maintainers = [] + + if type(maintainer) == str: + maintainers.append(dict( + name=maintainer, + email=None, + github=None, + )) + + elif type(maintainer) == dict: + maintainers.append(dict( + name=maintainer.get("name"), + email=maintainer.get("email"), + github=maintainer.get("github"), + )) + + elif type(maintainer) == list: + for item in maintainer: + maintainers += get_maintainer(item) + + else: + logger.error(f"maintainer can not be recognized from: {maintainer}") + sys.exit(1) + + return maintainers + + +def remove_attr_set(name): + # some package sets the prefix is included in pname + sets = [ + # Packages + "emscripten", + "lua", + "php", + "pure", + "python", + "lisp", + "perl", + "ruby", + # Plugins + "elasticsearch", + "graylog", + "tmuxplugin" + "vimplugin" + ] + # TODO: is this correct + if any([name.startswith(i) for i in sets]): + name = "-".join(name.split("-")[1:]) + + # node does things a bit different + elif name.startswith("node_"): + name = name[len("node_"):] + + return name + + def get_packages(evaluation, evaluation_builds): logger.debug( f"get_packages: Retriving list of packages 
for '{evaluation['git_revision']}' revision" @@ -281,6 +410,7 @@ def get_packages(evaluation, evaluation_builds): def gen(): for attr_name, data in packages: + position = data["meta"].get("position") if position and position.startswith("/nix/store"): position = position[44:] @@ -300,16 +430,7 @@ def get_packages(evaluation, evaluation_builds): else: licenses = [] - maintainers = [ - type(maintainer) == str - and dict(name=maintainer, email=None, github=None) - or dict( - name=maintainer.get("name"), - email=maintainer.get("email"), - github=maintainer.get("github"), - ) - for maintainer in data["meta"].get("maintainers", []) - ] + maintainers = get_maintainer(data["meta"].get("maintainers", [])) platforms = [ type(platform) == str and platform or None @@ -319,9 +440,9 @@ def get_packages(evaluation, evaluation_builds): attr_set = None if "." in attr_name: attr_set = attr_name.split(".")[0] - if not attr_set.endswith("Packages") and not attr_set.endswith( - "Plugins" - ): + if not attr_set.endswith("Packages") and \ + not attr_set.endswith("Plugins") and \ + not attr_set.endswith("Extensions"): attr_set = None hydra = None @@ -349,8 +470,9 @@ def get_packages(evaluation, evaluation_builds): type="package", package_hydra=hydra, package_attr_name=attr_name, + package_attr_name_query=list(split_query(attr_name)), package_attr_set=attr_set, - package_pname=data["pname"], + package_pname=remove_attr_set(data["pname"]), package_pversion=data["version"], package_description=data["meta"].get("description"), package_longDescription=data["meta"].get("longDescription", ""), @@ -405,7 +527,7 @@ def get_options(evaluation): # we first check if there are some xml elements before using pypandoc # since pypandoc calls are quite slow root = xml.etree.ElementTree.fromstring(xml_description) - if len(root.find('para').getchildren()) > 0: + if len(list(root.find('para'))) > 0: description = pypandoc.convert_text( xml_description, "html", diff --git a/scripts/packages-config.nix b/scripts/packages-config.nix index 38b95ca..4082dcb 100644 --- a/scripts/packages-config.nix +++ b/scripts/packages-config.nix @@ -5,8 +5,38 @@ # Enable recursion into attribute sets that nix-env normally doesn't look into # so that we can get a more complete picture of the available packages for the # purposes of the index. 
- packageOverrides = super: { - haskellPackages = super.recurseIntoAttrs super.haskellPackages; - rPackages = super.recurseIntoAttrs super.rPackages; - }; + packageOverrides = super: + let + recurseIntoAttrs = sets: + super.lib.genAttrs + (builtins.filter (set: builtins.hasAttr set super) sets) + (set: super.recurseIntoAttrs (builtins.getAttr set super)); + in recurseIntoAttrs [ + "roundcubePlugins" + "emscriptenfastcompPackages" + "fdbPackages" + "nodePackages_latest" + "nodePackages" + "platformioPackages" + "haskellPackages" + "idrisPackages" + "sconsPackages" + "gns3Packages" + "quicklispPackagesClisp" + "quicklispPackagesSBCL" + "rPackages" + "apacheHttpdPackages_2_4" + "zabbix44" + "zabbix40" + "zabbix30" + "fusePackages" + "nvidiaPackages" + "sourceHanPackages" + "atomPackages" + "emacs25Packages" + "emacs26Packages" + "steamPackages" + "ut2004Packages" + "zeroadPackages" + ]; } diff --git a/src/Page/Packages.elm b/src/Page/Packages.elm index 0d1de25..93c665e 100644 --- a/src/Page/Packages.elm +++ b/src/Page/Packages.elm @@ -19,6 +19,7 @@ import Html , dl , dt , li + , p , table , tbody , td @@ -42,6 +43,7 @@ import Http import Json.Decode import Json.Decode.Pipeline import Json.Encode +import Regex import Search @@ -186,13 +188,33 @@ viewResultItem channel show item = else [] in - tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ] - [ td [] [ text item.source.attr_name ] - , td [] [ text item.source.pname ] - , td [] [ text item.source.pversion ] - , td [] [ text <| Maybe.withDefault "" item.source.description ] - ] - :: packageDetails + [] + -- DEBUG: |> List.append + -- DEBUG: [ tr [] + -- DEBUG: [ td [ colspan 4 ] + -- DEBUG: [ p [] [ text <| "score: " ++ String.fromFloat item.score ] + -- DEBUG: , p [] + -- DEBUG: [ text <| + -- DEBUG: "matched queries: " + -- DEBUG: , ul [] + -- DEBUG: (item.matched_queries + -- DEBUG: |> Maybe.withDefault [] + -- DEBUG: |> List.sort + -- DEBUG: |> List.map (\q -> li [] [ text q ]) + -- DEBUG: ) + -- DEBUG: ] + -- DEBUG: ] + -- DEBUG: ] + -- DEBUG: ] + |> List.append + (tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ] + [ td [] [ text <| item.source.attr_name ] + , td [] [ text item.source.pname ] + , td [] [ text item.source.pversion ] + , td [] [ text <| Maybe.withDefault "" item.source.description ] + ] + :: packageDetails + ) viewResultItemDetails : @@ -345,126 +367,232 @@ makeRequestBody : -> Int -> Int -> Http.Body -makeRequestBody query from size = - -- Prefix Query - -- example query for "python" - -- { - -- "from": 0, - -- "size": 10, - -- "query": { - -- "bool": { - -- "filter": { - -- "match": { - -- "type": "package" - -- } - -- }, - -- "must": { - -- "bool": { - -- "should": [ - -- { - -- "multi_match": { - -- "query": "python", - -- "boost": 1, - -- "fields": [ - -- "package_attr_name.raw", - -- "package_attr_name" - -- ], - -- "type": "most_fields" - -- } - -- }, - -- { - -- "term": { - -- "type": { - -- "value": "package", - -- "boost": 0 - -- } - -- } - -- }, - -- { - -- "term": { - -- "package_pname": { - -- "value": "python", - -- "boost": 2 - -- } - -- } - -- }, - -- { - -- "term": { - -- "package_pversion": { - -- "value": "python", - -- "boost": 0.2 - -- } - -- } - -- }, - -- { - -- "term": { - -- "package_description": { - -- "value": "python", - -- "boost": 0.3 - -- } - -- } - -- }, - -- { - -- "term": { - -- "package_longDescription": { - -- "value": "python", - -- "boost": 0.1 - -- } - -- } - -- } - -- ] - -- } - -- } - -- } - -- } - -- } +makeRequestBody queryRaw from size = 
let - listIn name type_ value = - [ ( name, Json.Encode.list type_ value ) ] + query = + queryRaw + |> String.trim - objectIn name value = - [ ( name, Json.Encode.object value ) ] + delimiters = + Maybe.withDefault Regex.never (Regex.fromString "[. ]") - encodeTerm ( name, boost ) = - [ ( "value", Json.Encode.string query ) - , ( "boost", Json.Encode.float boost ) - ] - |> objectIn name - |> objectIn "term" - in - [ ( "package_pname", 2.0 ) - , ( "package_pversion", 0.2 ) - , ( "package_description", 0.3 ) - , ( "package_longDescription", 0.1 ) - ] - |> List.map encodeTerm - |> List.append - [ [ "package_attr_name.raw" - , "package_attr_name" - ] - |> listIn "fields" Json.Encode.string - |> List.append - [ ( "query", Json.Encode.string query ) - , ( "boost", Json.Encode.float 1.0 ) + should_match boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "match" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "query", Json.Encode.string query ) + , ( "boost", Json.Encode.float boost ) + , ( "analyzer", Json.Encode.string "whitespace" ) + , ( "fuzziness", Json.Encode.string "1" ) + , ( "_name" + , Json.Encode.string <| + "should_match_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) ] - |> objectIn "multi_match" + ) + [ ( "package_attr_name", 1 ) + , ( "package_attr_name_query", 1 ) + , ( "package_pname", 1 ) + , ( "package_description", 1 ) + , ( "package_longDescription", 1 ) + ] + + should_match_bool_prefix boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "match_bool_prefix" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "query", Json.Encode.string query ) + , ( "boost", Json.Encode.float boost ) + , ( "analyzer", Json.Encode.string "whitespace" ) + , ( "fuzziness", Json.Encode.string "1" ) + , ( "_name" + , Json.Encode.string <| + "should_match_bool_prefix_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) + ] + ) + [ ( "package_attr_name", 1 ) + , ( "package_attr_name_query", 1 ) + , ( "package_pname", 1 ) + ] + + should_terms boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "terms" + , Json.Encode.object + [ ( field + , Json.Encode.list Json.Encode.string (Regex.split delimiters query) + ) + , ( "boost", Json.Encode.float <| boost_base * boost ) + , ( "_name" + , Json.Encode.string <| + "should_terms_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) + [ ( "package_attr_name", 1 ) + , ( "package_attr_name_query", 1 ) + , ( "package_pname", 1 ) + , ( "package_attr_set", 1 ) + ] + + should_term boost_base = + List.indexedMap + (\i ( field, boost ) -> + [ ( "term" + , Json.Encode.object + [ ( field + , Json.Encode.object + [ ( "value", Json.Encode.string query ) + , ( "boost", Json.Encode.float <| boost_base * boost ) + , ( "_name" + , Json.Encode.string <| + "should_term_" + ++ String.fromInt (i + 1) + ) + ] + ) + ] + ) + ] + ) + [ ( "package_attr_name", 1 ) + , ( "package_attr_name_query", 1 ) + , ( "package_pname", 1 ) + ] + + filter_packages = + ( "term" + , Json.Encode.object + [ ( "type" + , Json.Encode.object + [ ( "value", Json.Encode.string "package" ) + , ( "_name", Json.Encode.string "filter_packages" ) + ] + ) + ] + ) + + filter_queries = + let + filterQuery = + query + |> String.replace "." 
" " + in + filterQuery + |> String.words + |> List.indexedMap + (\i query_word -> + let + isLast = + List.length (String.words filterQuery) == i + 1 + in + [ if isLast then + ( "bool" + , Json.Encode.object + [ ( "should" + , Json.Encode.list Json.Encode.object + [ [ ( "match" + , Json.Encode.object + [ ( "package_attr_name_query" + , Json.Encode.object + [ ( "query", Json.Encode.string query_word ) + , ( "fuzziness", Json.Encode.string "1" ) + , ( "_name", Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_match" ) + ] + ) + ] + ) + ] + , [ ( "match_bool_prefix" + , Json.Encode.object + [ ( "package_attr_name_query" + , Json.Encode.object + [ ( "query", Json.Encode.string query_word ) + , ( "_name" + , Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_prefix" + ) + ] + ) + ] + ) + ] + ] + ) + ] + ) + + else + ( "match_bool_prefix" + , Json.Encode.object + [ ( "package_attr_name_query" + , Json.Encode.object + [ ( "query", Json.Encode.string query_word ) + , ( "_name" + , Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_prefix" + ) + ] + ) + ] + ) + ] + ) + in + Http.jsonBody + (Json.Encode.object + [ ( "from" + , Json.Encode.int from + ) + , ( "size" + , Json.Encode.int size + ) + , ( "query" + , Json.Encode.object + [ ( "bool" + , Json.Encode.object + [ ( "filter" + , Json.Encode.list Json.Encode.object + (List.append + [ [ filter_packages ] ] + filter_queries + ) + ) + , ( "should" + , Json.Encode.list + Json.Encode.object + ([] + |> List.append (should_term 10000) + |> List.append (should_terms 1000) + |> List.append (should_match_bool_prefix 100) + |> List.append (should_match 10) + ) + ) + ] + ) + ] + ) ] - |> listIn "should" Json.Encode.object - |> objectIn "bool" - |> objectIn "must" - |> ([ ( "type", Json.Encode.string "package" ) ] - |> objectIn "match" - |> objectIn "filter" - |> List.append - ) - |> objectIn "bool" - |> objectIn "query" - |> List.append - [ ( "from", Json.Encode.int from ) - , ( "size", Json.Encode.int size ) - ] - |> Json.Encode.object - |> Http.jsonBody + ) makeRequest : diff --git a/src/Search.elm b/src/Search.elm index 4fa6c7a..ba446f0 100644 --- a/src/Search.elm +++ b/src/Search.elm @@ -91,6 +91,7 @@ type alias ResultItem a = , id : String , score : Float , source : a + , matched_queries : Maybe (List String) } @@ -622,8 +623,9 @@ decodeResultHitsTotal = decodeResultItem : Json.Decode.Decoder a -> Json.Decode.Decoder (ResultItem a) decodeResultItem decodeResultItemSource = - Json.Decode.map4 ResultItem + Json.Decode.map5 ResultItem (Json.Decode.field "_index" Json.Decode.string) (Json.Decode.field "_id" Json.Decode.string) (Json.Decode.field "_score" Json.Decode.float) (Json.Decode.field "_source" decodeResultItemSource) + (Json.Decode.maybe (Json.Decode.field "matched_queries" (Json.Decode.list Json.Decode.string))) diff --git a/src/index.js b/src/index.js index 7d24c99..0733230 100644 --- a/src/index.js +++ b/src/index.js @@ -6,7 +6,7 @@ const {Elm} = require('./Main'); Elm.Main.init({ flags: { - elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 5, + elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 6, elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443', elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR', elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG'