improve packages search query (#102)

This commit is contained in:
Rok Garbas 2020-06-18 12:24:52 +02:00 committed by GitHub
parent 0fb5f699b9
commit c420d05815
8 changed files with 458 additions and 157 deletions

1
.gitignore vendored
View file

@ -31,3 +31,4 @@ dist
package-lock.json
result
scripts/eval-*
eval-*

View file

@ -18,6 +18,24 @@ For backend we are using Elasticsearch instance which is kindly sponsored by
[Elm](https://elm-lang.org).
## How does search work?
The use case we want to solve is that a visitor wants to check whether a package
exists or to look up a certain package's details.
A user wants to converge on a single result if possible. The more characters
are added to the search query, the narrower the search becomes, and we should
show fewer results.
Ranking of search results is also very important. It brings the more relevant
results to the top, since it is often hard to write a search query that returns
only a single result.
Less important, but still improving the user experience, are suggestions for
writing a better search query. The suggestion feature should guide the user
toward better queries, which in turn produce better results.
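To make this concrete, here is a minimal sketch in Python (using the elasticsearch client the import scripts already depend on) of the kind of bool query the frontend sends: every word of the query has to (prefix-)match the package attribute name, so typing more characters or more words can only shrink the result set, while a separate set of "should" clauses re-ranks whatever is left. Index name, fields and boosts are illustrative, not the exact production values.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # illustrative endpoint

def search_packages(query, size=10):
    words = query.replace(".", " ").split()
    body = {
        "from": 0,
        "size": size,
        "query": {
            "bool": {
                # every word must (prefix-)match: more input, fewer results
                "filter": [{"term": {"type": {"value": "package"}}}]
                + [
                    {"match_bool_prefix": {"package_attr_name_query": {"query": w}}}
                    for w in words
                ],
                # these clauses only influence ranking, not which results are returned
                "should": [
                    {"term": {"package_attr_name": {"value": query, "boost": 10000}}},
                    {"term": {"package_pname": {"value": query, "boost": 10000}}},
                    {"match": {"package_description": {"query": query, "boost": 1}}},
                ],
            }
        },
    }
    return es.search(index="nixos-unstable-packages", body=body)  # index name is hypothetical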
## Ideas we want to explore
Apart from searching packages and options we would like to:

View file

@ -12,6 +12,7 @@
"elm/html": "1.0.0",
"elm/http": "2.0.0",
"elm/json": "1.1.3",
"elm/regex": "1.0.0",
"elm/url": "1.0.0",
"hecrj/html-parser": "2.3.4",
"krisajenkins/remotedata": "6.0.1",
@ -21,7 +22,6 @@
"elm/bytes": "1.0.8",
"elm/file": "1.0.5",
"elm/parser": "1.1.0",
"elm/regex": "1.0.0",
"elm/time": "1.0.0",
"elm/virtual-dom": "1.0.2",
"rtfeldman/elm-hex": "1.0.0"

View file

@ -13,7 +13,6 @@
import boto3
import botocore
import botocore.client
import xml.etree.ElementTree
import click
import click_log
import elasticsearch
@ -22,10 +21,12 @@ import json
import logging
import os.path
import pypandoc
import re
import requests
import shlex
import subprocess
import tqdm
import xml.etree.ElementTree
logger = logging.getLogger("import-channel")
click_log.basic_config(logger)
@ -33,7 +34,7 @@ click_log.basic_config(logger)
S3_BUCKET = "nix-releases"
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
INDEX_SCHEMA_VERSION = 5
INDEX_SCHEMA_VERSION = 6
CHANNELS = {
"unstable": {
"packages": "nixpkgs/nixpkgs-20.09pre",
@ -49,11 +50,18 @@ CHANNELS = {
},
}
ANALYSIS = {
"analyzer": {
"nixAttrName": {
"normalizer": {
"lowercase": {
"type": "custom",
"tokenizer": "nix_attrname",
"filter": ["lowercase", "nix_stopwords"],
"char_filter": [],
"filter": ["lowercase"],
}
},
"analyzer": {
"lowercase": {
"type": "custom",
"tokenizer": "keyword",
"filter": ["lowercase"],
},
"nixOptionName": {
"type": "custom",
@ -67,13 +75,24 @@ ANALYSIS = {
},
},
"tokenizer": {
"nix_attrname": {
"nix_package_query": {
"type": "pattern",
"pattern": "|".join(
[
"[ ]",
]
),
},
"nix_package_attr_name": {
"type": "pattern",
# Split on attrname separators like _, .
"pattern": "|".join(
[
"[_.-]", # Common separators like underscores, dots and dashes
"\\d+?Packages", # python37Packages -> python
"\\d+?Plugins", # vimPlugins -> vim
"\\d+?Extensions", # php74Extensions -> php
"\\d+?Interpreters", # perlInterpreters -> perl
# Camelcase tokenizer adapted from
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
"".join(
@ -118,7 +137,18 @@ ANALYSIS = {
"nix_stopwords": {
"type": "stop",
"ignore_case": True,
"stopwords": ["packages", "package", "options", "option"],
"stopwords": [
"packages",
"package",
"options",
"option",
"plugins",
"plugin",
"extensions",
"extension",
"interpreters",
"interpreter",
],
},
},
}
@ -146,12 +176,21 @@ MAPPING = {
},
},
"package_attr_name": {
"type": "text",
"analyzer": "nixAttrName",
"fields": {"raw": {"type": "keyword"}},
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_name_query": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_set": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_pname": {
"type": "keyword",
"normalizer": "lowercase",
},
"package_attr_set": {"type": "keyword"},
"package_pname": {"type": "keyword"},
"package_pversion": {"type": "keyword"},
"package_description": {"type": "text"},
"package_longDescription": {"type": "text"},
@ -195,6 +234,39 @@ MAPPING = {
}
def split_query(text):
"""Tokenize package attr_name
Example:
python37Packages.test1_name-test2
= index: 0
- python37Packages.test1_name-test2
- python37Packages.test1_name
- python37Packages.test1
- python37
- python
= index: 1
- test1_name-test2
- test1_name
- test1
= index: 2
- name-test2
- name
= index: 3
- test2
"""
tokens = []
regex = re.compile(".+?(?:(?<=[a-z])(?=[1-9A-Z])|(?<=[1-9A-Z])(?=[A-Z][a-z])|[\._-]|$)")
parts = [m.group(0) for m in regex.finditer(text)]
for index in range(len(parts)):
prev_parts = ""
for part in parts[index:]:
tokens.append((prev_parts + part).rstrip("_.-"))
prev_parts += part
return tokens
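For illustration, a hand-traced example of the tokens split_query generates (the attribute name is made up and the list follows from tracing the regex above, so treat it as approximate rather than captured output):

# hand-traced example, attribute name is made up
split_query("python37Packages.requests")
# ==> ['python', 'python37', 'python37Packages', 'python37Packages.requests',
#      '37', '37Packages', '37Packages.requests',
#      'Packages', 'Packages.requests',
#      'requests']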
def get_last_evaluation(prefix):
logger.debug(f"Retriving last evaluation for {prefix} prefix.")
@ -265,6 +337,63 @@ def get_evaluation_builds(evaluation_id):
return result
def get_maintainer(maintainer):
maintainers = []
if type(maintainer) == str:
maintainers.append(dict(
name=maintainer,
email=None,
github=None,
))
elif type(maintainer) == dict:
maintainers.append(dict(
name=maintainer.get("name"),
email=maintainer.get("email"),
github=maintainer.get("github"),
))
elif type(maintainer) == list:
for item in maintainer:
maintainers += get_maintainer(item)
else:
logger.error(f"maintainer can not be recognized from: {maintainer}")
sys.exit(1)
return maintainers
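For reference, a couple of hand-evaluated examples of the normalization get_maintainer performs (the inputs are hypothetical, not taken from nixpkgs):

# hypothetical inputs, evaluated by hand
get_maintainer("alice")
# ==> [{'name': 'alice', 'email': None, 'github': None}]
get_maintainer([{"name": "bob", "email": "bob@example.com"}, "carol"])
# ==> [{'name': 'bob', 'email': 'bob@example.com', 'github': None},
#      {'name': 'carol', 'email': None, 'github': None}]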
def remove_attr_set(name):
# for some package sets, the prefix is included in the pname
sets = [
# Packages
"emscripten",
"lua",
"php",
"pure",
"python",
"lisp",
"perl",
"ruby",
# Plugins
"elasticsearch",
"graylog",
"tmuxplugin"
"vimplugin"
]
# TODO: is this correct
if any([name.startswith(i) for i in sets]):
name = "-".join(name.split("-")[1:])
# node does things a bit different
elif name.startswith("node_"):
name = name[len("node_"):]
return name
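A few hand-written examples of the intended behaviour (the pnames are illustrative, not taken from a real evaluation):

# illustrative pnames, evaluated by hand against the code above
remove_attr_set("python3.8-requests")  # ==> "requests"
remove_attr_set("perl5.30.2-JSON")     # ==> "JSON"
remove_attr_set("node_left-pad")       # ==> "left-pad" (node uses an underscore prefix)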
def get_packages(evaluation, evaluation_builds):
logger.debug(
f"get_packages: Retriving list of packages for '{evaluation['git_revision']}' revision"
@ -281,6 +410,7 @@ def get_packages(evaluation, evaluation_builds):
def gen():
for attr_name, data in packages:
position = data["meta"].get("position")
if position and position.startswith("/nix/store"):
position = position[44:]
@ -300,16 +430,7 @@ def get_packages(evaluation, evaluation_builds):
else:
licenses = []
maintainers = [
type(maintainer) == str
and dict(name=maintainer, email=None, github=None)
or dict(
name=maintainer.get("name"),
email=maintainer.get("email"),
github=maintainer.get("github"),
)
for maintainer in data["meta"].get("maintainers", [])
]
maintainers = get_maintainer(data["meta"].get("maintainers", []))
platforms = [
type(platform) == str and platform or None
@ -319,9 +440,9 @@ def get_packages(evaluation, evaluation_builds):
attr_set = None
if "." in attr_name:
attr_set = attr_name.split(".")[0]
if not attr_set.endswith("Packages") and not attr_set.endswith(
"Plugins"
):
if not attr_set.endswith("Packages") and \
not attr_set.endswith("Plugins") and \
not attr_set.endswith("Extensions"):
attr_set = None
hydra = None
@ -349,8 +470,9 @@ def get_packages(evaluation, evaluation_builds):
type="package",
package_hydra=hydra,
package_attr_name=attr_name,
package_attr_name_query=list(split_query(attr_name)),
package_attr_set=attr_set,
package_pname=data["pname"],
package_pname=remove_attr_set(data["pname"]),
package_pversion=data["version"],
package_description=data["meta"].get("description"),
package_longDescription=data["meta"].get("longDescription", ""),
@ -405,7 +527,7 @@ def get_options(evaluation):
# we first check if there are some xml elements before using pypandoc
# since pypandoc calls are quite slow
root = xml.etree.ElementTree.fromstring(xml_description)
if len(root.find('para').getchildren()) > 0:
if len(list(root.find('para'))) > 0:
description = pypandoc.convert_text(
xml_description,
"html",

View file

@ -5,8 +5,38 @@
# Enable recursion into attribute sets that nix-env normally doesn't look into
# so that we can get a more complete picture of the available packages for the
# purposes of the index.
packageOverrides = super: {
haskellPackages = super.recurseIntoAttrs super.haskellPackages;
rPackages = super.recurseIntoAttrs super.rPackages;
};
packageOverrides = super:
let
recurseIntoAttrs = sets:
super.lib.genAttrs
(builtins.filter (set: builtins.hasAttr set super) sets)
(set: super.recurseIntoAttrs (builtins.getAttr set super));
in recurseIntoAttrs [
"roundcubePlugins"
"emscriptenfastcompPackages"
"fdbPackages"
"nodePackages_latest"
"nodePackages"
"platformioPackages"
"haskellPackages"
"idrisPackages"
"sconsPackages"
"gns3Packages"
"quicklispPackagesClisp"
"quicklispPackagesSBCL"
"rPackages"
"apacheHttpdPackages_2_4"
"zabbix44"
"zabbix40"
"zabbix30"
"fusePackages"
"nvidiaPackages"
"sourceHanPackages"
"atomPackages"
"emacs25Packages"
"emacs26Packages"
"steamPackages"
"ut2004Packages"
"zeroadPackages"
];
}

View file

@ -19,6 +19,7 @@ import Html
, dl
, dt
, li
, p
, table
, tbody
, td
@ -42,6 +43,7 @@ import Http
import Json.Decode
import Json.Decode.Pipeline
import Json.Encode
import Regex
import Search
@ -186,13 +188,33 @@ viewResultItem channel show item =
else
[]
in
tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ]
[ td [] [ text item.source.attr_name ]
, td [] [ text item.source.pname ]
, td [] [ text item.source.pversion ]
, td [] [ text <| Maybe.withDefault "" item.source.description ]
]
:: packageDetails
[]
-- DEBUG: |> List.append
-- DEBUG: [ tr []
-- DEBUG: [ td [ colspan 4 ]
-- DEBUG: [ p [] [ text <| "score: " ++ String.fromFloat item.score ]
-- DEBUG: , p []
-- DEBUG: [ text <|
-- DEBUG: "matched queries: "
-- DEBUG: , ul []
-- DEBUG: (item.matched_queries
-- DEBUG: |> Maybe.withDefault []
-- DEBUG: |> List.sort
-- DEBUG: |> List.map (\q -> li [] [ text q ])
-- DEBUG: )
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
-- DEBUG: ]
|> List.append
(tr [ onClick (SearchMsg (Search.ShowDetails item.source.attr_name)) ]
[ td [] [ text <| item.source.attr_name ]
, td [] [ text item.source.pname ]
, td [] [ text item.source.pversion ]
, td [] [ text <| Maybe.withDefault "" item.source.description ]
]
:: packageDetails
)
viewResultItemDetails :
@ -345,126 +367,232 @@ makeRequestBody :
-> Int
-> Int
-> Http.Body
makeRequestBody query from size =
-- Prefix Query
-- example query for "python"
-- {
-- "from": 0,
-- "size": 10,
-- "query": {
-- "bool": {
-- "filter": {
-- "match": {
-- "type": "package"
-- }
-- },
-- "must": {
-- "bool": {
-- "should": [
-- {
-- "multi_match": {
-- "query": "python",
-- "boost": 1,
-- "fields": [
-- "package_attr_name.raw",
-- "package_attr_name"
-- ],
-- "type": "most_fields"
-- }
-- },
-- {
-- "term": {
-- "type": {
-- "value": "package",
-- "boost": 0
-- }
-- }
-- },
-- {
-- "term": {
-- "package_pname": {
-- "value": "python",
-- "boost": 2
-- }
-- }
-- },
-- {
-- "term": {
-- "package_pversion": {
-- "value": "python",
-- "boost": 0.2
-- }
-- }
-- },
-- {
-- "term": {
-- "package_description": {
-- "value": "python",
-- "boost": 0.3
-- }
-- }
-- },
-- {
-- "term": {
-- "package_longDescription": {
-- "value": "python",
-- "boost": 0.1
-- }
-- }
-- }
-- ]
-- }
-- }
-- }
-- }
-- }
makeRequestBody queryRaw from size =
let
listIn name type_ value =
[ ( name, Json.Encode.list type_ value ) ]
query =
queryRaw
|> String.trim
objectIn name value =
[ ( name, Json.Encode.object value ) ]
delimiters =
Maybe.withDefault Regex.never (Regex.fromString "[. ]")
encodeTerm ( name, boost ) =
[ ( "value", Json.Encode.string query )
, ( "boost", Json.Encode.float boost )
]
|> objectIn name
|> objectIn "term"
in
[ ( "package_pname", 2.0 )
, ( "package_pversion", 0.2 )
, ( "package_description", 0.3 )
, ( "package_longDescription", 0.1 )
]
|> List.map encodeTerm
|> List.append
[ [ "package_attr_name.raw"
, "package_attr_name"
]
|> listIn "fields" Json.Encode.string
|> List.append
[ ( "query", Json.Encode.string query )
, ( "boost", Json.Encode.float 1.0 )
should_match boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "match"
, Json.Encode.object
[ ( field
, Json.Encode.object
[ ( "query", Json.Encode.string query )
, ( "boost", Json.Encode.float boost )
, ( "analyzer", Json.Encode.string "whitespace" )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name"
, Json.Encode.string <|
"should_match_"
++ String.fromInt (i + 1)
)
]
)
]
)
]
|> objectIn "multi_match"
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
, ( "package_description", 1 )
, ( "package_longDescription", 1 )
]
should_match_bool_prefix boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "match_bool_prefix"
, Json.Encode.object
[ ( field
, Json.Encode.object
[ ( "query", Json.Encode.string query )
, ( "boost", Json.Encode.float boost )
, ( "analyzer", Json.Encode.string "whitespace" )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name"
, Json.Encode.string <|
"should_match_bool_prefix_"
++ String.fromInt (i + 1)
)
]
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
]
should_terms boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "terms"
, Json.Encode.object
[ ( field
, Json.Encode.list Json.Encode.string (Regex.split delimiters query)
)
, ( "boost", Json.Encode.float <| boost_base * boost )
, ( "_name"
, Json.Encode.string <|
"should_terms_"
++ String.fromInt (i + 1)
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
, ( "package_attr_set", 1 )
]
should_term boost_base =
List.indexedMap
(\i ( field, boost ) ->
[ ( "term"
, Json.Encode.object
[ ( field
, Json.Encode.object
[ ( "value", Json.Encode.string query )
, ( "boost", Json.Encode.float <| boost_base * boost )
, ( "_name"
, Json.Encode.string <|
"should_term_"
++ String.fromInt (i + 1)
)
]
)
]
)
]
)
[ ( "package_attr_name", 1 )
, ( "package_attr_name_query", 1 )
, ( "package_pname", 1 )
]
filter_packages =
( "term"
, Json.Encode.object
[ ( "type"
, Json.Encode.object
[ ( "value", Json.Encode.string "package" )
, ( "_name", Json.Encode.string "filter_packages" )
]
)
]
)
filter_queries =
let
filterQuery =
query
|> String.replace "." " "
in
filterQuery
|> String.words
|> List.indexedMap
(\i query_word ->
let
isLast =
List.length (String.words filterQuery) == i + 1
in
[ if isLast then
( "bool"
, Json.Encode.object
[ ( "should"
, Json.Encode.list Json.Encode.object
[ [ ( "match"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "fuzziness", Json.Encode.string "1" )
, ( "_name", Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_match" )
]
)
]
)
]
, [ ( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_should_prefix"
)
]
)
]
)
]
]
)
]
)
else
( "match_bool_prefix"
, Json.Encode.object
[ ( "package_attr_name_query"
, Json.Encode.object
[ ( "query", Json.Encode.string query_word )
, ( "_name"
, Json.Encode.string <| "filter_queries_" ++ String.fromInt (i + 1) ++ "_prefix"
)
]
)
]
)
]
)
in
Http.jsonBody
(Json.Encode.object
[ ( "from"
, Json.Encode.int from
)
, ( "size"
, Json.Encode.int size
)
, ( "query"
, Json.Encode.object
[ ( "bool"
, Json.Encode.object
[ ( "filter"
, Json.Encode.list Json.Encode.object
(List.append
[ [ filter_packages ] ]
filter_queries
)
)
, ( "should"
, Json.Encode.list
Json.Encode.object
([]
|> List.append (should_term 10000)
|> List.append (should_terms 1000)
|> List.append (should_match_bool_prefix 100)
|> List.append (should_match 10)
)
)
]
)
]
)
]
|> listIn "should" Json.Encode.object
|> objectIn "bool"
|> objectIn "must"
|> ([ ( "type", Json.Encode.string "package" ) ]
|> objectIn "match"
|> objectIn "filter"
|> List.append
)
|> objectIn "bool"
|> objectIn "query"
|> List.append
[ ( "from", Json.Encode.int from )
, ( "size", Json.Encode.int size )
]
|> Json.Encode.object
|> Http.jsonBody
)
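For orientation, this is roughly the request body the new makeRequestBody builds for the single-word query "python", written as a Python-style literal (the same notation as the backend scripts). The clause list is abbreviated and the boosts and _name values are copied by hand from the code above, so treat it as an illustration rather than the exact payload.

request_body = {
    "from": 0,
    "size": 10,
    "query": {
        "bool": {
            # hard filters: results must be packages, and each query word has to
            # match (or prefix-match) package_attr_name_query
            "filter": [
                {"term": {"type": {"value": "package", "_name": "filter_packages"}}},
                {"bool": {"should": [
                    {"match": {"package_attr_name_query": {
                        "query": "python", "fuzziness": "1",
                        "_name": "filter_queries_1_should_match"}}},
                    {"match_bool_prefix": {"package_attr_name_query": {
                        "query": "python",
                        "_name": "filter_queries_1_should_prefix"}}},
                ]}},
            ],
            # scoring-only clauses, from fuzzy matches up to exact terms,
            # with boost bases 10 / 100 / 1000 / 10000
            "should": [
                {"match": {"package_attr_name": {
                    "query": "python", "boost": 1, "analyzer": "whitespace",
                    "fuzziness": "1", "_name": "should_match_1"}}},
                # ... more match, match_bool_prefix and terms clauses ...
                {"term": {"package_pname": {
                    "value": "python", "boost": 10000, "_name": "should_term_3"}}},
            ],
        }
    },
}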
makeRequest :

View file

@ -91,6 +91,7 @@ type alias ResultItem a =
, id : String
, score : Float
, source : a
, matched_queries : Maybe (List String)
}
@ -622,8 +623,9 @@ decodeResultHitsTotal =
decodeResultItem : Json.Decode.Decoder a -> Json.Decode.Decoder (ResultItem a)
decodeResultItem decodeResultItemSource =
Json.Decode.map4 ResultItem
Json.Decode.map5 ResultItem
(Json.Decode.field "_index" Json.Decode.string)
(Json.Decode.field "_id" Json.Decode.string)
(Json.Decode.field "_score" Json.Decode.float)
(Json.Decode.field "_source" decodeResultItemSource)
(Json.Decode.maybe (Json.Decode.field "matched_queries" (Json.Decode.list Json.Decode.string)))

View file

@ -6,7 +6,7 @@ const {Elm} = require('./Main');
Elm.Main.init({
flags: {
elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 5,
elasticsearchMappingSchemaVersion: process.env.ELASTICSEARCH_MAPPING_SCHEMA_VERSION || 6,
elasticsearchUrl: process.env.ELASTICSEARCH_URL || 'https://nixos-search-5886075189.us-east-1.bonsaisearch.net:443',
elasticsearchUsername : process.env.ELASTICSEARCH_USERNAME || 'z3ZFJ6y2mR',
elasticsearchPassword : process.env.ELASTICSEARCH_PASSWORD || 'ds8CEvALPf9pui7XG'