improve packages search query (#102)

This commit is contained in:
Rok Garbas 2020-06-18 12:24:52 +02:00 committed by GitHub
parent 0fb5f699b9
commit c420d05815
Failed to generate hash of commit
8 changed files with 458 additions and 157 deletions

1
.gitignore vendored
View file

@ -31,3 +31,4 @@ dist
package-lock.json package-lock.json
result result
scripts/eval-* scripts/eval-*
eval-*

View file

@ -18,6 +18,24 @@ For backend we are using Elasticsearch instance which is kindly sponsored by
[Elm](https://elm-lang.org). [Elm](https://elm-lang.org).
## How search works?
The use case we want to solve is that a visitor wants to see if a package
exists or to look up a certain package's details.
A user wants to converge on a single result if possible. The more characters
are added to a search query, the narrower the search becomes, and we should
show fewer results.
Ranking of search results is also very important. It brings the more relevant
search results to the top, since it is often hard to produce a search query
that outputs only one result item.
Less important, but providing a better user experience, are suggestions for
writing a better search query. The suggestion feature should guide the user
to write better queries, which in turn will produce better results.
## Ideas we want to explore ## Ideas we want to explore
Apart from searching packages and options we would like to: Apart from searching packages and options we would like to:

View file

@ -12,6 +12,7 @@
"elm/html": "1.0.0", "elm/html": "1.0.0",
"elm/http": "2.0.0", "elm/http": "2.0.0",
"elm/json": "1.1.3", "elm/json": "1.1.3",
"elm/regex": "1.0.0",
"elm/url": "1.0.0", "elm/url": "1.0.0",
"hecrj/html-parser": "2.3.4", "hecrj/html-parser": "2.3.4",
"krisajenkins/remotedata": "6.0.1", "krisajenkins/remotedata": "6.0.1",
@ -21,7 +22,6 @@
"elm/bytes": "1.0.8", "elm/bytes": "1.0.8",
"elm/file": "1.0.5", "elm/file": "1.0.5",
"elm/parser": "1.1.0", "elm/parser": "1.1.0",
"elm/regex": "1.0.0",
"elm/time": "1.0.0", "elm/time": "1.0.0",
"elm/virtual-dom": "1.0.2", "elm/virtual-dom": "1.0.2",
"rtfeldman/elm-hex": "1.0.0" "rtfeldman/elm-hex": "1.0.0"

View file

@ -13,7 +13,6 @@
import boto3 import boto3
import botocore import botocore
import botocore.client import botocore.client
import xml.etree.ElementTree
import click import click
import click_log import click_log
import elasticsearch import elasticsearch
@ -22,10 +21,12 @@ import json
import logging import logging
import os.path import os.path
import pypandoc import pypandoc
import re
import requests import requests
import shlex import shlex
import subprocess import subprocess
import tqdm import tqdm
import xml.etree.ElementTree
logger = logging.getLogger("import-channel") logger = logging.getLogger("import-channel")
click_log.basic_config(logger) click_log.basic_config(logger)
@ -33,7 +34,7 @@ click_log.basic_config(logger)
S3_BUCKET = "nix-releases" S3_BUCKET = "nix-releases"
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
INDEX_SCHEMA_VERSION = 5 INDEX_SCHEMA_VERSION = 6
CHANNELS = { CHANNELS = {
"unstable": { "unstable": {
"packages": "nixpkgs/nixpkgs-20.09pre", "packages": "nixpkgs/nixpkgs-20.09pre",
@ -49,11 +50,18 @@ CHANNELS = {
}, },
} }
ANALYSIS = { ANALYSIS = {
"analyzer": { "normalizer": {
"nixAttrName": { "lowercase": {
"type": "custom", "type": "custom",
"tokenizer": "nix_attrname", "char_filter": [],
"filter": ["lowercase", "nix_stopwords"], "filter": ["lowercase"],
}
},
"analyzer": {
"lowercase": {
"type": "custom",
"tokenizer": "keyword",
"filter": ["lowercase"],
}, },
"nixOptionName": { "nixOptionName": {
"type": "custom", "type": "custom",
@ -67,13 +75,24 @@ ANALYSIS = {
}, },
}, },
"tokenizer": { "tokenizer": {
"nix_attrname": { "nix_package_query": {
"type": "pattern",
"pattern": "|".join(
[
"[ ]",
]
),
},
"nix_package_attr_name": {
"type": "pattern", "type": "pattern",
# Split on attrname separators like _, . # Split on attrname separators like _, .
"pattern": "|".join( "pattern": "|".join(
[ [
"[_.-]", # Common separators like underscores, dots and dashes "[_.-]", # Common separators like underscores, dots and dashes
"\\d+?Packages", # python37Packages -> python "\\d+?Packages", # python37Packages -> python
"\\d+?Plugins", # vimPlugins -> vim
"\\d+?Extensions", # php74Extensions -> php
"\\d+?Interpreters", # perlInterpreters -> perl
# Camelcase tokenizer adapted from # Camelcase tokenizer adapted from
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
"".join( "".join(
@ -118,7 +137,18 @@ ANALYSIS = {
"nix_stopwords": { "nix_stopwords": {
"type": "stop", "type": "stop",
"ignore_case": True, "ignore_case": True,
"stopwords": ["packages", "package", "options", "option"], "stopwords": [
"packages",
"package",
"options",
"option",
"plugins",
"plugin",
"extensions",
"extension",
"interpreters",
"interpreter",
],
}, },
}, },
} }
@ -146,12 +176,21 @@ MAPPING = {
}, },
}, },
"package_attr_name": { "package_attr_name": {
"type": "text", "type": "keyword",
"analyzer": "nixAttrName", "normalizer": "lowercase",
"fields": {"raw": {"type": "keyword"}}, },
"package_attr_name_query": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_set": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_pname": {
"type": "keyword",
"normalizer": "lowercase",
}, },
"package_attr_set": {"type": "keyword"},
"package_pname": {"type": "keyword"},
"package_pversion": {"type": "keyword"}, "package_pversion": {"type": "keyword"},
"package_description": {"type": "text"}, "package_description": {"type": "text"},
"package_longDescription": {"type": "text"}, "package_longDescription": {"type": "text"},
@ -195,6 +234,39 @@ MAPPING = {
} }
def split_query(text):
    """Tokenize a package attr_name into progressively longer search tokens.

    The attr_name is first split into parts on common separators
    (``_``, ``.``, ``-``), on lowercase->digit/uppercase boundaries and on
    digit/uppercase->CamelCase boundaries.  For every suffix of the parts
    list we then emit the cumulative joins of its parts, with trailing
    separators stripped.  For ``python37Packages.test1_name-test2`` this
    yields (among others):

        = index: 0
          - python37Packages.test1_name-test2
          - python37Packages.test1_name
          - python37Packages.test1
          - python37
          - python
        = index: 1
          - test1_name-test2
          - test1_name
          - test1
        = index: 2
          - name-test2
          - name
        = index: 3
          - test2
    """
    # Raw string: "\." inside a plain string literal is an invalid escape
    # sequence (SyntaxWarning/DeprecationWarning on newer Pythons).
    regex = re.compile(
        r".+?(?:(?<=[a-z])(?=[1-9A-Z])|(?<=[1-9A-Z])(?=[A-Z][a-z])|[\._-]|$)"
    )
    parts = [m.group(0) for m in regex.finditer(text)]
    tokens = []
    for index in range(len(parts)):
        prev_parts = ""
        for part in parts[index:]:
            # Each part keeps its trailing separator; strip it from the
            # emitted token but keep it in the running prefix.
            tokens.append((prev_parts + part).rstrip("_.-"))
            prev_parts += part
    return tokens
def get_last_evaluation(prefix): def get_last_evaluation(prefix):
logger.debug(f"Retriving last evaluation for {prefix} prefix.") logger.debug(f"Retriving last evaluation for {prefix} prefix.")
@ -265,6 +337,63 @@ def get_evaluation_builds(evaluation_id):
return result return result
def get_maintainer(maintainer):
    """Normalize a nixpkgs ``meta.maintainers`` entry.

    ``maintainer`` may be a plain name (str), a dict with ``name``/``email``/
    ``github`` keys, or a (possibly nested) list of either.  Always returns a
    flat list of dicts with the keys ``name``, ``email`` and ``github``
    (missing fields are ``None``).  Exits the process on any other type.
    """
    maintainers = []
    if isinstance(maintainer, str):
        maintainers.append(
            dict(
                name=maintainer,
                email=None,
                github=None,
            )
        )
    elif isinstance(maintainer, dict):
        maintainers.append(
            dict(
                name=maintainer.get("name"),
                email=maintainer.get("email"),
                github=maintainer.get("github"),
            )
        )
    elif isinstance(maintainer, list):
        for item in maintainer:
            maintainers += get_maintainer(item)
    else:
        # The original called sys.exit(1) without `sys` being imported at
        # module level, which would raise NameError instead of exiting.
        import sys

        logger.error(f"maintainer can not be recognized from: {maintainer}")
        sys.exit(1)
    return maintainers
def remove_attr_set(name):
    """Strip a leading package-set prefix from a pname.

    For some package sets the set prefix is included in the pname
    (e.g. ``python-requests`` -> ``requests``); node packages use a
    ``node_`` prefix instead of a dash separator.
    """
    sets = [
        # Packages
        "emscripten",
        "lua",
        "php",
        "pure",
        "python",
        "lisp",
        "perl",
        "ruby",
        # Plugins
        "elasticsearch",
        "graylog",
        # BUG FIX: the original list was missing commas here, which silently
        # concatenated these two into the single string "tmuxpluginvimplugin",
        # so neither prefix was ever stripped.
        "tmuxplugin",
        "vimplugin",
    ]
    # TODO: is this correct
    if any(name.startswith(i) for i in sets):
        name = "-".join(name.split("-")[1:])
    # node does things a bit different
    elif name.startswith("node_"):
        name = name[len("node_"):]
    return name
def get_packages(evaluation, evaluation_builds): def get_packages(evaluation, evaluation_builds):
logger.debug( logger.debug(
f"get_packages: Retriving list of packages for '{evaluation['git_revision']}' revision" f"get_packages: Retriving list of packages for '{evaluation['git_revision']}' revision"
@ -281,6 +410,7 @@ def get_packages(evaluation, evaluation_builds):
def gen(): def gen():
for attr_name, data in packages: for attr_name, data in packages:
position = data["meta"].get("position") position = data["meta"].get("position")
if position and position.startswith("/nix/store"): if position and position.startswith("/nix/store"):
position = position[44:] position = position[44:]
@ -300,16 +430,7 @@ def get_packages(evaluation, evaluation_builds):
else: else:
licenses = [] licenses = []
maintainers = [ maintainers = get_maintainer(data["meta"].get("maintainers", []))
type(maintainer) == str
and dict(name=maintainer, email=None, github=None)
or dict(
name=maintainer.get("name"),
email=maintainer.get("email"),
github=maintainer.get("github"),
)
for maintainer in data["meta"].get("maintainers", [])
]
platforms = [ platforms = [
type(platform) == str and platform or None type(platform) == str and platform or None
@ -319,9 +440,9 @@ def get_packages(evaluation, evaluation_builds):
attr_set = None attr_set = None
if "." in attr_name: if "." in attr_name:
attr_set = attr_name.split(".")[0] attr_set = attr_name.split(".")[0]
if not attr_set.endswith("Packages") and not attr_set.endswith( if not attr_set.endswith("Packages") and \
"Plugins" not attr_set.endswith("Plugins") and \
): not attr_set.endswith("Extensions"):
attr_set = None attr_set = None
hydra = None hydra = None
@ -349,8 +470,9 @@ def get_packages(evaluation, evaluation_builds):
type="package", type="package",
package_hydra=hydra, package_hydra=hydra,
package_attr_name=attr_name, package_attr_name=attr_name,
package_attr_name_query=list(split_query(attr_name)),
package_attr_set=attr_set, package_attr_set=attr_set,
package_pname=data["pname"], package_pname=remove_attr_set(data["pname"]),
package_pversion=data["version"], package_pversion=data["version"],
package_description=data["meta"].get("description"), package_description=data["meta"].get("description"),
package_longDescription=data["meta"].get("longDescription", ""), package_longDescription=data["meta"].get("longDescription", ""),
@ -405,7 +527,7 @@ def get_options(evaluation):
# we first check if there are some xml elements before using pypandoc # we first check if there are some xml elements before using pypandoc
# since pypandoc calls are quite slow # since pypandoc calls are quite slow
root = xml.etree.ElementTree.fromstring(xml_description) root = xml.etree.ElementTree.fromstring(xml_description)
if len(root.find('para').getchildren()) > 0: if len(list(root.find('para'))) > 0:
description = pypandoc.convert_text( description = pypandoc.convert_text(
xml_description, xml_description,
"html", "html",

View file

@ -5,8 +5,38 @@
# Enable recursion into attribute sets that nix-env normally doesn't look into # Enable recursion into attribute sets that nix-env normally doesn't look into
# so that we can get a more complete picture of the available packages for the # so that we can get a more complete picture of the available packages for the
# purposes of the index. # purposes of the index.
packageOverrides = super: { packageOverrides = super:
haskellPackages = super.recurseIntoAttrs super.haskellPackages; let
rPackages = super.recurseIntoAttrs super.rPackages; recurseIntoAttrs = sets:
}; super.lib.genAttrs
(builtins.filter (set: builtins.hasAttr set super) sets)
(set: super.recurseIntoAttrs (builtins.getAttr set super));
in recurseIntoAttrs [
"roundcubePlugins"
"emscriptenfastcompPackages"
"fdbPackages"
"nodePackages_latest"
"nodePackages"
"platformioPackages"
"haskellPackages"
"idrisPackages"
"sconsPackages"
"gns3Packages"
"quicklispPackagesClisp"
"quicklispPackagesSBCL"
"rPackages"
"apacheHttpdPackages_2_4"
"zabbix44"
"zabbix40"
"zabbix30"
"fusePackages"
"nvidiaPackages"
"sourceHanPackages"
"atomPackages"
"emacs25Packages"
"emacs26Packages"
"steamPackages"
"ut2004Packages"
"zeroadPackages"
];
} }

View file

@ -19,6 +19,7 @@ import Html
, dl , dl
, dt , dt
, li , li
, p
, table , table
, tbody , tbody
, td , td
@ -42,6 +43,7 @@ import Http
import Json.Decode import Json.Decode
import Json.Decode.Pipeline import Json.Decode.Pipeline
import Json.Encode import Json.Encode
import Regex
import Search import Search
@ -186,13 +188,33 @@ viewResultItem channel show item =
else else
[] []
in in
tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ] []
[ td [] [ text item.source.attr_name ] -- DEBUG: |> List.append
-- DEBUG: [ tr []
-- DEBUG: [ td [ colspan 4 ]
-- DEBUG: [ p [] [ text <| "score: " ++ String.fromFloat item.score ]
-- DEBUG: , p []
-- DEBUG: [ text <|
-- DEBUG: "matched queries: "
-- DEBUG: , ul []
-- DEBUG: (item.matched_queries
-- DEBUG: |> Maybe.withDefault []
-- DEBUG: |> List.sort
-- DEBUG: |> List.map (\q -> li [] [ text q ])
-- DEBUG: )
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
|> List.append
(tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ]
[ td [] [ text <| item.source.attr_name ]
, td [] [ text item.source.pname ] , td [] [ text item.source.pname ]
, td [] [ text item.source.pversion ] , td [] [ text item.source.pversion ]
, td [] [ text <| Maybe.withDefault "" item.source.description ] , td [] [ text <| Maybe.withDefault "" item.source.description ]
] ]
:: packageDetails :: packageDetails
)
viewResultItemDetails : viewResultItemDetails :
@ -345,126 +367,232 @@ makeRequestBody :
-> Int -> Int
-> Int -> Int
-> Http.Body -> Http.Body
makeRequestBody query from size = makeRequestBody queryRaw from size =
-- Prefix Query
-- example query for "python"
-- {
-- "from": 0,
-- "size": 10,
-- "query": {
-- "bool": {
-- "filter": {
-- "match": {
-- "type": "package"
-- }
-- },
-- "must": {
-- "bool": {
-- "should": [
-- {
-- "multi_match": {
-- "query": "python",
-- "boost": 1,
-- "fields": [
-- "package_attr_name.raw",
-- "package_attr_name"
-- ],
-- "type": "most_fields"
-- }
-- },
-- {
-- "term": {
-- "type": {
-- "value": "package",
-- "boost": 0
-- }
-- }
-- },
-- {
-- "term": {
-- "package_pname": {
-- "value": "python",
-- "boost": 2
-- }
-- }
-- },
-- {
-- "term": {
-- "package_pversion": {
-- "value": "python",
-- "boost": 0.2
-- }
-- }
-- },
-- {
-- "term": {
-- "package_description": {
-- "value": "python",
-- "boost": 0.3
-- }
-- }
-- },
-- {
-- "term": {
-- "package_longDescription": {
-- "value": "python",
-- "boost": 0.1
-- }
-- }
-- }
-- ]
-- }
-- }
-- }
-- }
-- }
let let
listIn name type_ value = query =
[ ( name, Json.Encode.list type_ value ) ] queryRaw
|> String.trim
objectIn name value = delimiters =
[ ( name, Json.Encode.object value ) ] Maybe.withDefault Regex.never (Regex.fromString "[. ]")
encodeTerm ( name, boost ) = should_match boost_base =
[ ( "value", Json.Encode.string query ) List.indexedMap
, ( "boost", Json.Encode.float boost ) (\i ( field, boost ) ->
] [ ( "match"
|> objectIn name , Json.Encode.object
|> objectIn "term" [ ( field
in , Json.Encode.object
[ ( "package_pname", 2.0 )
, ( "package_pversion", 0.2 )
, ( "package_description", 0.3 )
, ( "package_longDescription", 0.1 )
]
|> List.map encodeTerm
|> List.append
[ [ "package_attr_name.raw"
, "package_attr_name"
]
|> listIn "fields" Json.Encode.string
|> List.append
[ ( "query", Json.Encode.string query ) [ ( "query", Json.Encode.string query )
, ( "boost", Json.Encode.float 1.0 ) , ( "boost", Json.Encode.float boost )
] , ( "analyzer", Json.Encode.string "whitespace" )
|> objectIn "multi_match" , ( "fuzziness", Json.Encode.string "1" )
] , ( "_name"
|> listIn "should" Json.Encode.object , Json.Encode.string <|
|> objectIn "bool" "should_match_"
|> objectIn "must" ++ String.fromInt (i + 1)
|> ([ ( "type", Json.Encode.string "package" ) ]
|> objectIn "match"
|> objectIn "filter"
|> List.append
) )
|> objectIn "bool"
|> objectIn "query"
|> List.append
[ ( "from", Json.Encode.int from )
, ( "size", Json.Encode.int size )
] ]
|> Json.Encode.object )
|> Http.jsonBody ]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
, ( "package_description", 1 )
, ( "package_longDescription", 1 )
]
should_match_bool_prefix boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "match_bool_prefix"
, Json.Encode.object
[ ( field
, Json.Encode.object
[ ( "query", Json.Encode.string query )
, ( "boost", Json.Encode.float boost )
, ( "analyzer", Json.Encode.string "whitespace" )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name"
, Json.Encode.string <|
"should_match_bool_prefix_"
++ String.fromInt (i + 1)
)
]
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
]
should_terms boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "terms"
, Json.Encode.object
[ ( field
, Json.Encode.list Json.Encode.string (Regex.split delimiters query)
)
, ( "boost", Json.Encode.float <| boost_base * boost )
, ( "_name"
, Json.Encode.string <|
"should_terms_"
++ String.fromInt (i + 1)
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
, ( "package_attr_set", 1 )
]
should_term boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "term"
, Json.Encode.object
[ ( field
, Json.Encode.object
[ ( "value", Json.Encode.string query )
, ( "boost", Json.Encode.float <| boost_base * boost )
, ( "_name"
, Json.Encode.string <|
"should_term_"
++ String.fromInt (i + 1)
)
]
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
]
filter_packages =
( "term"
, Json.Encode.object
[ ( "type"
, Json.Encode.object
[ ( "value", Json.Encode.string "package" )
, ( "_name", Json.Encode.string "filter_packages" )
]
)
]
)
filter_queries =
let
filterQuery =
query
|> String.replace "." " "
in
filterQuery
|> String.words
|> List.indexedMap
(\i query_word ->
let
isLast =
List.length (String.words filterQuery) == i + 1
in
[ if isLast then
( "bool"
, Json.Encode.object
[ ( "should"
, Json.Encode.list Json.Encode.object
[ [ ( "match"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name", Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_match" )
]
)
]
)
]
, [ ( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_prefix"
)
]
)
]
)
]
]
)
]
)
else
( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_prefix"
)
]
)
]
)
]
)
in
Http.jsonBody
(Json.Encode.object
[ ( "from"
, Json.Encode.int from
)
, ( "size"
, Json.Encode.int size
)
, ( "query"
, Json.Encode.object
[ ( "bool"
, Json.Encode.object
[ ( "filter"
, Json.Encode.list Json.Encode.object
(List.append
[ [ filter_packages ] ]
filter_queries
)
)
, ( "should"
, Json.Encode.list
Json.Encode.object
([]
|> List.append (should_term 10000)
|> List.append (should_terms 1000)
|> List.append (should_match_bool_prefix 100)
|> List.append (should_match 10)
)
)
]
)
]
)
]
)
makeRequest : makeRequest :

View file

@ -91,6 +91,7 @@ type alias ResultItem a =
, id : String , id : String
, score : Float , score : Float
, source : a , source : a
, matched_queries : Maybe (List String)
} }
@ -622,8 +623,9 @@ decodeResultHitsTotal =
decodeResultItem : Json.Decode.Decoder a -> Json.Decode.Decoder (ResultItem a) decodeResultItem : Json.Decode.Decoder a -> Json.Decode.Decoder (ResultItem a)
decodeResultItem decodeResultItemSource = decodeResultItem decodeResultItemSource =
Json.Decode.map4 ResultItem Json.Decode.map5 ResultItem
(Json.Decode.field "_index" Json.Decode.string) (Json.Decode.field "_index" Json.Decode.string)
(Json.Decode.field "_id" Json.Decode.string) (Json.Decode.field "_id" Json.Decode.string)
(Json.Decode.field "_score" Json.Decode.float) (Json.Decode.field "_score" Json.Decode.float)
(Json.Decode.field "_source" decodeResultItemSource) (Json.Decode.field "_source" decodeResultItemSource)
(Json.Decode.maybe (Json.Decode.field "matched_queries" (Json.Decode.list Json.Decode.string)))

View file

@ -6,7 +6,7 @@ const {Elm} = require('./Main');
Elm.Main.init({ Elm.Main.init({
flags: { flags: {
elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 5, elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 6,
elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443', elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443',
elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR', elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR',
elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG' elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG'