Use a custom attrname analyzer (#35)

* Use unsigned boto S3 requests

Without this change you need S3 credentials, even though the bucket is public.

* Use custom attrname analyzer

* Adapt query to new schema

Use pname/pversion so that the field names do not clash with Elasticsearch's parsing of version.
This commit is contained in:
adisbladis 2020-05-19 12:54:48 +02:00 committed by GitHub
parent 3816a7033a
commit 042cb40a8e
Failed to generate hash of commit
3 changed files with 96 additions and 56 deletions

View file

@ -11,15 +11,54 @@ import os.path
import shlex
import subprocess
import tqdm
import botocore.client
import botocore
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# Elasticsearch "analysis" settings, passed as ``settings.analysis`` when an
# index is created.  They declare the custom ``nixAttrName`` analyzer, which
# tokenizes with the ``nix_attrname`` pattern tokenizer and then applies the
# built-in ``lowercase`` filter followed by the ``nix_stopwords`` stop filter.
ANALYSIS = dict(
    analyzer=dict(
        nixAttrName=dict(
            type="custom",
            tokenizer="nix_attrname",
            filter=["lowercase", "nix_stopwords"],
        ),
    ),
    tokenizer=dict(
        nix_attrname=dict(
            type="pattern",
            # The pattern is an alternation; each alternative describes one
            # place where an attrname is split into tokens.
            pattern="|".join(
                (
                    # Common separators like underscores, dots and dashes.
                    r"[_.-]",
                    # python37Packages -> python
                    r"\d+?Packages",
                    # Camelcase boundaries, adapted from
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
                    #
                    # Between a letter that is not upper case and an upper
                    # case letter ...
                    r"(?<=[\p{L}&&[^\p{Lu}]])(?=\p{Lu})",
                    # ... or between an upper case letter and an upper case
                    # letter that is itself followed by a letter that is not
                    # upper case.
                    r"(?<=\p{Lu})(?=\p{Lu}[\p{L}&&[^\p{Lu}]])",
                )
            ),
        ),
    ),
    filter=dict(
        nix_stopwords=dict(
            type="stop",
            ignore_case=True,
            # Tokens dropped from the analyzed attrname.
            stopwords=["packages", "package", "options", "option"],
        ),
    ),
)
def get_last_evaluation(channel):
project, project_version = channel.split("-", 1)
s3 = boto3.client("s3")
s3 = boto3.client("s3", config=botocore.client.Config(signature_version=botocore.UNSIGNED))
s3_result = s3.list_objects(
Bucket="nix-releases",
Prefix=f"{project}/{project_version}/",
@ -64,9 +103,9 @@ def get_packages(evaluation):
licenses = data["meta"].get("license")
if licenses:
if type(licenses) == str:
licenses = [dict(fullName=licenses)]
licenses = [dict(fullName=licenses)]
elif type(licenses) == dict:
licenses = [licenses]
licenses = [licenses]
licenses = [
type(license) == str
and dict(fullName=license, url=None)
@ -97,8 +136,8 @@ def get_packages(evaluation):
yield dict(
id=attr_name,
attr_name=attr_name,
name=data["pname"],
version=data["version"],
pname=data["pname"],
pversion=data["version"],
description=data["meta"].get("description"),
longDescription=data["meta"].get("longDescription", ""),
license=licenses,
@ -151,12 +190,20 @@ def recreate_index(es, channel):
es.indices.create(
index=f"{channel}-packages",
body=dict(
settings=dict(number_of_shards=1),
settings=dict(number_of_shards=1, analysis=ANALYSIS),
mappings=dict(
properties=dict(
attr_name=dict(type="keyword"),
name=dict(type="keyword"),
version=dict(type="text"),
attr_name=dict(
type="text",
analyzer="nixAttrName",
fields={
"raw": {
"type": "keyword",
}
},
),
pname=dict(type="keyword"),
pversion=dict(type="text"),
description=dict(type="text"),
longDescription=dict(type="text"),
license=dict(
@ -186,7 +233,7 @@ def recreate_index(es, channel):
es.indices.create(
index=f"{channel}-options",
body=dict(
settings=dict(number_of_shards=1),
settings=dict(number_of_shards=1, analysis=ANALYSIS),
mappings=dict(
properties=dict(
option_name=dict(type="keyword"),

View file

@ -442,53 +442,46 @@ makeRequestBody :
-> Int
-> Http.Body
makeRequestBody field query from size =
let
stringIn name value =
[ ( name, Json.Encode.string value ) ]
objectIn name object =
[ ( name, Json.Encode.object object ) ]
in
-- Prefix Query
-- {
-- ""
-- "query": {
-- "prefix": {
-- "user": {
-- "value": ""
-- }
-- "multi_match" : {
-- "query": "python37Packages.requests",
-- "fields": [
-- "attr_name.raw",
-- "attr_name",
-- "pname",
-- "pversion",
-- "description",
-- "longDescription"
-- ]
-- }
-- }
-- }
--query
-- |> stringIn "value"
-- |> objectIn field
-- |> objectIn "prefix"
-- |> objectIn "query"
-- |> Json.Encode.object
-- |> Http.jsonBody
--
-- Wildcard Query
-- {
-- "query": {
-- "wildcard": {
-- "<field>": {
-- "value": "*<value>*",
-- }
-- }
-- }
-- }
("*" ++ query ++ "*")
|> stringIn "value"
|> objectIn field
|> objectIn "wildcard"
|> objectIn "query"
|> List.append
Http.jsonBody
(Json.Encode.object
[ ( "from", Json.Encode.int from )
, ( "size", Json.Encode.int size )
, ( "query"
, Json.Encode.object
[ ( "multi_match"
, Json.Encode.object
[ ( "query", Json.Encode.string query )
, ( "fields"
, Json.Encode.list Json.Encode.string
[ "attr_name.raw"
, "attr_name"
, "pname"
, "pversion"
, "description"
, "longDescription"
]
)
]
)
]
)
]
|> Json.Encode.object
|> Http.jsonBody
)
makeRequest :

View file

@ -53,8 +53,8 @@ type alias Model =
type alias ResultItemSource =
{ attr_name : String
, name : String
, version : String
, pname : String
, pversion : String
, description : Maybe String
, longDescription : Maybe String
, licenses : List ResultPackageLicense
@ -163,8 +163,8 @@ viewResultItem showDetailsFor item =
in
tr [ onClick (SearchMsg (ElasticSearch.ShowDetails item.id)) ]
[ td [] [ text item.source.attr_name ]
, td [] [ text item.source.name ]
, td [] [ text item.source.version ]
, td [] [ text item.source.pname ]
, td [] [ text item.source.pversion ]
, td [] [ text <| Maybe.withDefault "" item.source.description ]
]
:: packageDetails
@ -304,8 +304,8 @@ decodeResultItemSource : Json.Decode.Decoder ResultItemSource
decodeResultItemSource =
Json.Decode.succeed ResultItemSource
|> Json.Decode.Pipeline.required "attr_name" Json.Decode.string
|> Json.Decode.Pipeline.required "name" Json.Decode.string
|> Json.Decode.Pipeline.required "version" Json.Decode.string
|> Json.Decode.Pipeline.required "pname" Json.Decode.string
|> Json.Decode.Pipeline.required "pversion" Json.Decode.string
|> Json.Decode.Pipeline.required "description" (Json.Decode.nullable Json.Decode.string)
|> Json.Decode.Pipeline.required "longDescription" (Json.Decode.nullable Json.Decode.string)
|> Json.Decode.Pipeline.required "license" (Json.Decode.list decodeResultPackageLicense)