From 042cb40a8e7b237d4ba840b700bc93a7ef4e20c8 Mon Sep 17 00:00:00 2001 From: adisbladis Date: Tue, 19 May 2020 12:54:48 +0200 Subject: [PATCH] Use a custom attrname analyzer (#35) * Use unsigned boto S3 requests. Without this change you need S3 credentials, even though the bucket is public. * Use custom attrname analyzer * Adapt query to new schema. Use pname/pversion so the field names do not clash with Elasticsearch's parsing of version. --- scripts/import-channels-into-elasticsearch | 69 +++++++++++++++++---- src/ElasticSearch.elm | 71 ++++++++++------------ src/Page/Packages.elm | 12 ++-- 3 files changed, 96 insertions(+), 56 deletions(-) diff --git a/scripts/import-channels-into-elasticsearch b/scripts/import-channels-into-elasticsearch index cc42bda..1a0158d 100755 --- a/scripts/import-channels-into-elasticsearch +++ b/scripts/import-channels-into-elasticsearch @@ -11,15 +11,54 @@ import os.path import shlex import subprocess import tqdm +import botocore.client +import botocore + CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) +ANALYSIS = { + 'analyzer': { + 'nixAttrName': { + 'type': 'custom', + 'tokenizer': 'nix_attrname', + 'filter': ['lowercase', 'nix_stopwords'], + }, + }, + 'tokenizer': { + 'nix_attrname': { + 'type': 'pattern', + # Split on attrname separators like _, . 
+ 'pattern': "|".join([ + '[_.-]', # Common separators like underscores, dots and dashes + '\\d+?Packages', # python37Packages -> python + # Camelcase tokenizer adapted from + # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html + "".join([ + '(?<=[\\p{L}&&[^\\p{Lu}]])' # lower case + '(?=\\p{Lu})', # followed by upper case + '|', + '(?<=\\p{Lu})' # or upper case + '(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])', # followed by lower case + ]) + ]) + }, + }, + 'filter': { + 'nix_stopwords': { + 'type': 'stop', + 'ignore_case': True, + 'stopwords': ['packages', 'package', 'options', 'option'], + }, + }, +} + + def get_last_evaluation(channel): project, project_version = channel.split("-", 1) - - s3 = boto3.client("s3") + s3 = boto3.client("s3", config=botocore.client.Config(signature_version=botocore.UNSIGNED)) s3_result = s3.list_objects( Bucket="nix-releases", Prefix=f"{project}/{project_version}/", @@ -64,9 +103,9 @@ def get_packages(evaluation): licenses = data["meta"].get("license") if licenses: if type(licenses) == str: - licenses = [dict(fullName=licenses)] + licenses = [dict(fullName=licenses)] elif type(licenses) == dict: - licenses = [licenses] + licenses = [licenses] licenses = [ type(license) == str and dict(fullName=license, url=None) @@ -97,8 +136,8 @@ def get_packages(evaluation): yield dict( id=attr_name, attr_name=attr_name, - name=data["pname"], - version=data["version"], + pname=data["pname"], + pversion=data["version"], description=data["meta"].get("description"), longDescription=data["meta"].get("longDescription", ""), license=licenses, @@ -151,12 +190,20 @@ def recreate_index(es, channel): es.indices.create( index=f"{channel}-packages", body=dict( - settings=dict(number_of_shards=1), + settings=dict(number_of_shards=1, analysis=ANALYSIS), mappings=dict( properties=dict( - attr_name=dict(type="keyword"), - name=dict(type="keyword"), - version=dict(type="text"), + attr_name=dict( + type="text", + 
analyzer="nixAttrName", + fields={ + "raw": { + "type": "keyword", + } + }, + ), + pname=dict(type="keyword"), + pversion=dict(type="text"), description=dict(type="text"), longDescription=dict(type="text"), license=dict( @@ -186,7 +233,7 @@ def recreate_index(es, channel): es.indices.create( index=f"{channel}-options", body=dict( - settings=dict(number_of_shards=1), + settings=dict(number_of_shards=1, analysis=ANALYSIS), mappings=dict( properties=dict( option_name=dict(type="keyword"), diff --git a/src/ElasticSearch.elm b/src/ElasticSearch.elm index e840b8a..5ff063a 100644 --- a/src/ElasticSearch.elm +++ b/src/ElasticSearch.elm @@ -442,53 +442,46 @@ makeRequestBody : -> Int -> Http.Body makeRequestBody field query from size = - let - stringIn name value = - [ ( name, Json.Encode.string value ) ] - - objectIn name object = - [ ( name, Json.Encode.object object ) ] - in -- Prefix Query -- { - -- "" -- "query": { - -- "prefix": { - -- "user": { - -- "value": "" - -- } + -- "multi_match" : { + -- "query": "python37Packages.requests", + -- "fields": [ + -- "attr_name.raw", + -- "attr_name", + -- "pname", + -- "pversion", + -- "description", + -- "longDescription" + -- ] -- } -- } - -- } - --query - -- |> stringIn "value" - -- |> objectIn field - -- |> objectIn "prefix" - -- |> objectIn "query" - -- |> Json.Encode.object - -- |> Http.jsonBody - -- - -- Wildcard Query - -- { - -- "query": { - -- "wildcard": { - -- "": { - -- "value": "**", - -- } - -- } - -- } - -- } - ("*" ++ query ++ "*") - |> stringIn "value" - |> objectIn field - |> objectIn "wildcard" - |> objectIn "query" - |> List.append + Http.jsonBody + (Json.Encode.object [ ( "from", Json.Encode.int from ) , ( "size", Json.Encode.int size ) + , ( "query" + , Json.Encode.object + [ ( "multi_match" + , Json.Encode.object + [ ( "query", Json.Encode.string query ) + , ( "fields" + , Json.Encode.list Json.Encode.string + [ "attr_name.raw" + , "attr_name" + , "pname" + , "pversion" + , "description" + , 
"longDescription" + ] + ) + ] + ) + ] + ) ] - |> Json.Encode.object - |> Http.jsonBody + ) makeRequest : diff --git a/src/Page/Packages.elm b/src/Page/Packages.elm index 9016b3b..438b6a7 100644 --- a/src/Page/Packages.elm +++ b/src/Page/Packages.elm @@ -53,8 +53,8 @@ type alias Model = type alias ResultItemSource = { attr_name : String - , name : String - , version : String + , pname : String + , pversion : String , description : Maybe String , longDescription : Maybe String , licenses : List ResultPackageLicense @@ -163,8 +163,8 @@ viewResultItem showDetailsFor item = in tr [ onClick (SearchMsg (ElasticSearch.ShowDetails item.id)) ] [ td [] [ text item.source.attr_name ] - , td [] [ text item.source.name ] - , td [] [ text item.source.version ] + , td [] [ text item.source.pname ] + , td [] [ text item.source.pversion ] , td [] [ text <| Maybe.withDefault "" item.source.description ] ] :: packageDetails @@ -304,8 +304,8 @@ decodeResultItemSource : Json.Decode.Decoder ResultItemSource decodeResultItemSource = Json.Decode.succeed ResultItemSource |> Json.Decode.Pipeline.required "attr_name" Json.Decode.string - |> Json.Decode.Pipeline.required "name" Json.Decode.string - |> Json.Decode.Pipeline.required "version" Json.Decode.string + |> Json.Decode.Pipeline.required "pname" Json.Decode.string + |> Json.Decode.Pipeline.required "pversion" Json.Decode.string |> Json.Decode.Pipeline.required "description" (Json.Decode.nullable Json.Decode.string) |> Json.Decode.Pipeline.required "longDescription" (Json.Decode.nullable Json.Decode.string) |> Json.Decode.Pipeline.required "license" (Json.Decode.list decodeResultPackageLicense)