aux-search/scripts/import-channels-into-elasticsearch

#! /usr/bin/env nix-shell
#! nix-shell -i python3 -p python3 python3Packages.click python3Packages.elasticsearch python3Packages.boto3 python3Packages.tqdm


import boto3
import click
import elasticsearch
import elasticsearch.helpers
import json
import os.path
import shlex
import subprocess
import tqdm
import botocore.client
import botocore


# Absolute path of the directory containing this script; get_packages() looks
# up packages-config.nix relative to it.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))


# Regular expression on which the ``nix_attrname`` tokenizer splits an
# attribute name. The alternatives are tried in the order listed.
_NIX_ATTRNAME_SPLIT_PATTERN = "|".join((
    r'[_.-]',  # Common separators like underscores, dots and dashes
    r'\d+?Packages',  # python37Packages -> python
    # Camelcase tokenizer adapted from
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
    r'(?<=[\p{L}&&[^\p{Lu}]])(?=\p{Lu})',  # lower case followed by upper case
    r'(?<=\p{Lu})(?=\p{Lu}[\p{L}&&[^\p{Lu}]])',  # or upper case followed by upper + lower case
))

# Elasticsearch "analysis" index settings shared by every index this script
# creates: a custom analyzer for attribute names built from the pattern
# tokenizer above, a lowercase filter and a stop-word filter.
ANALYSIS = {
    'analyzer': {
        'nixAttrName': {
            'type': 'custom',
            'tokenizer': 'nix_attrname',
            'filter': ['lowercase', 'nix_stopwords'],
        },
    },
    'tokenizer': {
        'nix_attrname': {
            'type': 'pattern',
            'pattern': _NIX_ATTRNAME_SPLIT_PATTERN,
        },
    },
    'filter': {
        'nix_stopwords': {
            'type': 'stop',
            'ignore_case': True,
            'stopwords': ['packages', 'package', 'options', 'option'],
        },
    },
}


def get_last_evaluation(channel):
    """Return the most recent evaluation of *channel* listed in S3.

    The ``nix-releases`` bucket is listed with unsigned (anonymous) requests
    under ``<project>/<project_version>/``, where both parts come from
    splitting *channel* on its first dash.

    Args:
        channel: Channel name containing at least one dash; a name without
            a dash raises ``ValueError`` from the tuple unpacking.

    Returns:
        A dict with the keys ``revisions_since_start`` (int),
        ``git_revision`` (str) and ``prefix`` (the S3 prefix the other two
        values were parsed from), for the entry with the highest
        ``revisions_since_start``.

    Raises:
        IndexError: If no entry under the prefix could be parsed.
    """
    project, project_version = channel.split("-", 1)
    listing_prefix = f"{project}/{project_version}/"
    # The evaluation part is obtained by slicing off as many characters as
    # ``listing_prefix + channel`` is long, so this relies on the directory
    # name starting with a string of exactly that length.
    skip = len(f"{listing_prefix}{channel}")

    s3 = boto3.client(
        "s3",
        config=botocore.client.Config(signature_version=botocore.UNSIGNED),
    )
    # A single list_objects response is capped in size; paginate so that a
    # long listing cannot hide the newest evaluation.
    pages = s3.get_paginator("list_objects").paginate(
        Bucket="nix-releases",
        Prefix=listing_prefix,
        Delimiter="/",
    )

    evaluations = []
    for page in pages:
        # "CommonPrefixes" is absent when nothing matches the prefix.
        for item in page.get("CommonPrefixes") or []:
            if not item:
                continue
            prefix = item.get("Prefix")
            if not prefix:
                continue
            evaluation = prefix[skip:]
            if evaluation.startswith("beta"):
                evaluation = evaluation[len("beta"):]
            try:
                revisions_since_start, git_revision = (
                    evaluation.lstrip(".").rstrip("/").split(".")
                )
                revisions_since_start = int(revisions_since_start)
            except ValueError:
                # Not of the form "<number>.<revision>"; skip this entry.
                continue
            evaluations.append(dict(
                revisions_since_start=revisions_since_start,
                git_revision=git_revision,
                prefix=prefix,
            ))

    if not evaluations:
        # IndexError is what indexing the empty list used to raise; keep the
        # type but say what actually went wrong.
        raise IndexError(
            f"no evaluations found for channel {channel!r} "
            f"under s3://nix-releases/{listing_prefix}"
        )

    # A stable sort followed by [-1] keeps the last-listed entry among ties.
    evaluations.sort(key=lambda i: i["revisions_since_start"])
    return evaluations[-1]


# Number of leading characters dropped from a "/nix/store..." position.
# Presumably len("/nix/store/") plus a 32-character hash and a dash -- TODO confirm.
_STORE_PREFIX_LENGTH = 44


def _flatten(value):
    """Yield the non-list leaves of *value*, descending into nested lists."""
    if isinstance(value, list):
        for item in value:
            yield from _flatten(item)
    else:
        yield value


def _normalize_licenses(raw):
    """Turn a ``meta.license`` value into a list of ``fullName``/``url`` dicts."""
    if not raw:
        return []
    licenses = []
    for entry in _flatten(raw):
        if isinstance(entry, str):
            licenses.append(dict(fullName=entry, url=None))
        elif isinstance(entry, dict):
            licenses.append(dict(
                fullName=entry.get("fullName"),
                url=entry.get("url"),
            ))
        # Anything else cannot be represented in the index and is skipped.
    return licenses


def _normalize_maintainers(raw):
    """Turn a ``meta.maintainers`` value into ``name``/``email``/``github`` dicts."""
    maintainers = []
    for entry in _flatten(raw or []):
        if isinstance(entry, str):
            maintainers.append(dict(name=entry, email=None, github=None))
        elif isinstance(entry, dict):
            maintainers.append(dict(
                name=entry.get("name"),
                email=entry.get("email"),
                github=entry.get("github"),
            ))
        # Anything else cannot be represented in the index and is skipped.
    return maintainers


def get_packages(evaluation):
    """Query the packages of an evaluation with ``nix-env -qa --json``.

    Args:
        evaluation: Mapping with a ``git_revision`` key naming the
            nixpkgs-channels revision to evaluate.

    Returns:
        A ``(count, gen)`` tuple. ``count`` is the number of packages
        reported by nix-env and ``gen`` is a zero-argument callable that
        returns a fresh generator of one document dict per package.

    Raises:
        subprocess.CalledProcessError: If nix-env exits with a non-zero status.
    """
    # Pass an argument list rather than a shell-style string, so that
    # CURRENT_DIR never goes through shell-like tokenisation.
    command = [
        "nix-env",
        "-f", "<nixpkgs>",
        "-I", f"nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/{evaluation['git_revision']}.tar.gz",
        "--arg", "config", f"import {CURRENT_DIR}/packages-config.nix",
        "-qa",
        "--json",
    ]
    result = subprocess.run(command, stdout=subprocess.PIPE, check=True)
    packages = list(json.loads(result.stdout).items())

    def gen():
        for attr_name, data in packages:
            meta = data["meta"]
            position = meta.get("position")
            if position and position.startswith("/nix/store"):
                position = position[_STORE_PREFIX_LENGTH:]
            yield dict(
                id=attr_name,
                attr_name=attr_name,
                pname=data["pname"],
                pversion=data["version"],
                description=meta.get("description"),
                longDescription=meta.get("longDescription", ""),
                license=_normalize_licenses(meta.get("license")),
                maintainers=_normalize_maintainers(meta.get("maintainers", [])),
                # Only non-empty strings are kept as platforms.
                platforms=[
                    platform
                    for platform in meta.get("platforms", [])
                    if isinstance(platform, str) and platform
                ],
                position=position,
                homepage=meta.get("homepage"),
            )

    return len(packages), gen


def get_options(evaluation):
    """Build the NixOS options listing of an evaluation and load it.

    Runs ``nix-build`` for the ``options`` attribute of
    ``<nixpkgs/nixos/release.nix>`` and reads
    ``share/doc/nixos/options.json`` from the path it prints.

    Args:
        evaluation: Mapping with a ``git_revision`` key naming the
            nixpkgs-channels revision to build.

    Returns:
        A ``(count, gen)`` tuple. ``count`` is the number of options found
        (0 when the options file does not exist) and ``gen`` is a
        zero-argument callable that returns a fresh generator of one
        document dict per option.

    Raises:
        subprocess.CalledProcessError: If nix-build exits with a non-zero status.
    """
    command = [
        "nix-build",
        "<nixpkgs/nixos/release.nix>",
        "--no-out-link",
        "-A", "options",
        "-I", f"nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/{evaluation['git_revision']}.tar.gz",
    ]
    result = subprocess.run(command, stdout=subprocess.PIPE, check=True)
    build_output = result.stdout.strip().decode()
    options_file = f"{build_output}/share/doc/nixos/options.json"

    options = []
    if os.path.exists(options_file):
        # Read as UTF-8 explicitly instead of depending on the locale.
        with open(options_file, encoding="utf-8") as f:
            options = list(json.load(f).items())
    else:
        # Still a best-effort "no options" result, but no longer a silent one.
        click.echo(f"Warning: {options_file} not found, no options to index", err=True)

    def gen():
        for name, option in options:
            example = option.get("example")
            if isinstance(example, dict) and example.get("_type") == "literalExample":
                example = str(example["text"])
            # An empty "declarations" list yields no source instead of an
            # IndexError.
            declarations = option.get("declarations") or [None]
            yield dict(
                id=name,
                option_name=name,
                description=option.get("description"),
                type=option.get("type"),
                # NOTE(review): a missing default/example is indexed as the
                # string "None"; kept as-is because consumers of the index
                # may depend on it -- confirm before changing.
                default=str(option.get("default")),
                example=str(example),
                source=declarations[0],
            )

    return len(options), gen


def recreate_index(es, channel):
    """Delete (when present) and re-create both indices of *channel*.

    The ``<channel>-packages`` index is handled first, then
    ``<channel>-options``. Each one is created with a single shard, the
    shared ``ANALYSIS`` settings and an explicit field mapping.

    Args:
        es: Elasticsearch client whose ``indices`` API is used.
        channel: Channel name used as the prefix of both index names.
    """
    package_properties = {
        "attr_name": {
            "type": "text",
            "analyzer": "nixAttrName",
            "fields": {"raw": {"type": "keyword"}},
        },
        "pname": {"type": "keyword"},
        "pversion": {"type": "text"},
        "description": {"type": "text"},
        "longDescription": {"type": "text"},
        "license": {
            "type": "nested",
            "properties": {
                "fullName": {"type": "text"},
                "url": {"type": "text"},
            },
        },
        "maintainers": {
            "type": "nested",
            "properties": {
                "name": {"type": "text"},
                "email": {"type": "text"},
                "github": {"type": "text"},
            },
        },
        "platforms": {"type": "keyword"},
        "position": {"type": "text"},
        "homepage": {"type": "keyword"},
    }
    option_properties = {
        "option_name": {"type": "keyword"},
        "description": {"type": "text"},
        "type": {"type": "keyword"},
        "default": {"type": "text"},
        "example": {"type": "text"},
        "source": {"type": "keyword"},
    }

    index_layouts = (
        ("packages", package_properties),
        ("options", option_properties),
    )
    for suffix, properties in index_layouts:
        index_name = f"{channel}-{suffix}"
        if es.indices.exists(index_name):
            es.indices.delete(index=index_name)
        es.indices.create(
            index=index_name,
            body={
                "settings": {"number_of_shards": 1, "analysis": ANALYSIS},
                "mappings": {"properties": properties},
            },
        )


def _index_documents(es, index, documents, unit):
    """Bulk-index *documents* into *index*, reporting progress in *unit*."""
    if not documents:
        return
    total = len(documents)
    click.echo(f"Indexing {unit}...")
    successes = 0
    # The context manager closes the progress bar even if indexing fails.
    with tqdm.tqdm(unit=unit, total=total) as progress:
        for ok, _ in elasticsearch.helpers.streaming_bulk(
                client=es,
                index=index,
                actions=documents):
            progress.update(1)
            successes += ok
    click.echo(f"Indexed {successes}/{total} {unit}")


@click.command()
@click.option("--es-url", help="Elasticsearch connection url")
@click.option("--channel", required=True, help="Channel to import, e.g. nixos-20.03")
def main(es_url, channel):
    """Import the packages and options of a channel into Elasticsearch."""
    evaluation = get_last_evaluation(channel)

    # Evaluate and normalise everything before touching Elasticsearch, so a
    # failing nix evaluation does not leave freshly emptied indices behind.
    # Each document is also generated only once.
    _, gen_packages = get_packages(evaluation)
    packages = list(gen_packages())
    _, gen_options = get_options(evaluation)
    options = list(gen_options())

    es = elasticsearch.Elasticsearch([es_url])
    recreate_index(es, channel)

    # write packages
    _index_documents(es, f"{channel}-packages", packages, "packages")

    # write options
    _index_documents(es, f"{channel}-options", options, "options")


if __name__ == "__main__":
    main()
initial version of import script 2020-03-28 01:34:38 +00:00			`#! /usr/bin/env nix-shell`
			`#! nix-shell -i python3 -p python3 python3Packages.click python3Packages.elasticsearch python3Packages.boto3 python3Packages.tqdm`


			`import boto3`
			`import click`
			`import elasticsearch`
			`import elasticsearch.helpers`
			`import json`
			`import os.path`
			`import shlex`
			`import subprocess`
			`import tqdm`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`import botocore.client`
			`import botocore`

initial version of import script 2020-03-28 01:34:38 +00:00

			`CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))`


Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`ANALYSIS = {`
			`'analyzer': {`
			`'nixAttrName': {`
			`'type': 'custom',`
			`'tokenizer': 'nix_attrname',`
			`'filter': ['lowercase', 'nix_stopwords'],`
			`},`
			`},`
			`'tokenizer': {`
			`'nix_attrname': {`
			`'type': 'pattern',`
			`# Split on attrname separators like _, .`
			`'pattern': "\|".join([`
			`'[_.-]', # Common separators like underscores, dots and dashes`
			`'\\d+?Packages', # python37Packages -> python`
			`# Camelcase tokenizer adapted from`
			`# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html`
			`"".join([`
			`'(?<=[\\p{L}&&[^\\p{Lu}]])' # lower case`
			`'(?=\\p{Lu})', # followed by upper case`
			`'\|',`
			`'(?<=\\p{Lu})' # or upper case`
			`'(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])', # followed by lower case`
			`])`
			`])`
			`},`
			`},`
			`'filter': {`
			`'nix_stopwords': {`
			`'type': 'stop',`
			`'ignore_case': True,`
			`'stopwords': ['packages', 'package', 'options', 'option'],`
			`},`
			`},`
			`}`


initial version of import script 2020-03-28 01:34:38 +00:00			`def get_last_evaluation(channel):`
			`project, project_version = channel.split("-", 1)`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`s3 = boto3.client("s3", config=botocore.client.Config(signature_version=botocore.UNSIGNED))`
initial version of import script 2020-03-28 01:34:38 +00:00			`s3_result = s3.list_objects(`
			`Bucket="nix-releases",`
			`Prefix=f"{project}/{project_version}/",`
			`Delimiter="/",`
			`)`
			`evaluations = []`
			`for item in s3_result.get("CommonPrefixes"):`
initial version of elm+webpack setup 2020-03-28 04:09:01 +00:00			`if not item:`
initial version of import script 2020-03-28 01:34:38 +00:00			`continue`
			`prefix = item.get("Prefix")`
			`evaluation = prefix[len(f"{project}/{project_version}/{channel}"):]`
			`if evaluation.startswith("beta"):`
			`evaluation = evaluation[len("beta"):]`
initial version of elm+webpack setup 2020-03-28 04:09:01 +00:00			`try:`
			`revisions_since_start, git_revision = evaluation.lstrip(".").rstrip("/").split(".")`
			`except:`
			`continue`
initial version of import script 2020-03-28 01:34:38 +00:00			`evaluations.append(dict(`
			`revisions_since_start=int(revisions_since_start),`
			`git_revision=git_revision,`
			`prefix=prefix,`
			`))`

			`evaluations = sorted(evaluations, key=lambda i: i["revisions_since_start"])`
			`return evaluations[-1]`


			`def get_packages(evaluation):`
			`result = subprocess.run(`
			`shlex.split(f"nix-env -f '<nixpkgs>' -I nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/{evaluation['git_revision']}.tar.gz --arg config 'import {CURRENT_DIR}/packages-config.nix' -qa --json"),`
			`stdout=subprocess.PIPE,`
			`check=True,`
			`)`
			`packages = json.loads(result.stdout).items()`
somehow in a working state with bugs and missing features 2020-05-08 13:24:58 +00:00			`packages = list(packages)`
initial version of import script 2020-03-28 01:34:38 +00:00
			`def gen():`
			`for attr_name, data in packages:`
			`position = data["meta"].get("position")`
			`if position and position.startswith("/nix/store"):`
			`position = position[44:]`
			`licenses = data["meta"].get("license")`
			`if licenses:`
			`if type(licenses) == str:`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`licenses = [dict(fullName=licenses)]`
initial version of import script 2020-03-28 01:34:38 +00:00			`elif type(licenses) == dict:`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`licenses = [licenses]`
initial version of import script 2020-03-28 01:34:38 +00:00			`licenses = [`
			`type(license) == str`
			`and dict(fullName=license, url=None)`
			`or dict(`
			`fullName=license.get("fullName"),`
			`url=license.get("url"),`
			`)`
			`for license in licenses`
			`]`
make a request to elasticsearch, nothing fancy 2020-04-07 05:05:50 +00:00			`else:`
			`licenses = []`
initial version of import script 2020-03-28 01:34:38 +00:00			`maintainers = [`
			`type(maintainer) == str`
			`and dict(name=maintainer, email=None, github=None)`
			`or dict(`
			`name=maintainer.get("name"),`
			`email=maintainer.get("email"),`
			`github=maintainer.get("github"),`
			`)`
			`for maintainer in data["meta"].get("maintainers", [])`
			`]`
add style to the search result 2020-04-10 08:13:50 +00:00			`platforms = [`
			`type(platform) == str`
			`and platform`
			`or None`
			`for platform in data["meta"].get("platforms", [])`
			`]`
initial version of import script 2020-03-28 01:34:38 +00:00			`yield dict(`
add style to the search result 2020-04-10 08:13:50 +00:00			`id=attr_name,`
initial version of import script 2020-03-28 01:34:38 +00:00			`attr_name=attr_name,`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`pname=data["pname"],`
			`pversion=data["version"],`
initial version of import script 2020-03-28 01:34:38 +00:00			`description=data["meta"].get("description"),`
			`longDescription=data["meta"].get("longDescription", ""),`
			`license=licenses,`
			`maintainers=maintainers,`
add style to the search result 2020-04-10 08:13:50 +00:00			`platforms=[i for i in platforms if i],`
initial version of import script 2020-03-28 01:34:38 +00:00			`position=position,`
			`homepage=data["meta"].get("homepage"),`
			`)`

			`return len(packages), gen`


			`def get_options(evaluation):`
			`result = subprocess.run(`
			`shlex.split(f"nix-build <nixpkgs/nixos/release.nix> --no-out-link -A options -I nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/{evaluation['git_revision']}.tar.gz"),`
			`stdout=subprocess.PIPE,`
			`check=True,`
			`)`
			`options = []`
			`options_file = result.stdout.strip().decode()`
			`options_file = f"{options_file}/share/doc/nixos/options.json"`
			`if os.path.exists(options_file):`
			`with open(options_file) as f:`
			`options = json.load(f).items()`
somehow in a working state with bugs and missing features 2020-05-08 13:24:58 +00:00			`options = list(options)`
initial version of import script 2020-03-28 01:34:38 +00:00
			`def gen():`
			`for name, option in options:`
			`example = option.get("example")`
			`if example and \`
			`type(example) == dict and \`
			`example.get("_type") == "literalExample":`
			`example = str(example["text"])`
			`yield dict(`
add style to the search result 2020-04-10 08:13:50 +00:00			`id=name,`
initial version of import script 2020-03-28 01:34:38 +00:00			`option_name=name,`
			`description=option.get("description"),`
			`type=option.get("type"),`
			`default=str(option.get("default")),`
			`example=str(example),`
			`source=option.get("declarations", [None])[0],`
			`)`

			`return len(options), gen`

add style to the search result 2020-04-10 08:13:50 +00:00
initial version of import script 2020-03-28 01:34:38 +00:00			`def recreate_index(es, channel):`
			`if es.indices.exists(f"{channel}-packages"):`
			`es.indices.delete(index=f"{channel}-packages")`
			`es.indices.create(`
			`index=f"{channel}-packages",`
			`body=dict(`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`settings=dict(number_of_shards=1, analysis=ANALYSIS),`
initial version of import script 2020-03-28 01:34:38 +00:00			`mappings=dict(`
			`properties=dict(`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`attr_name=dict(`
			`type="text",`
			`analyzer="nixAttrName",`
			`fields={`
			`"raw": {`
			`"type": "keyword",`
			`}`
			`},`
			`),`
			`pname=dict(type="keyword"),`
			`pversion=dict(type="text"),`
initial version of import script 2020-03-28 01:34:38 +00:00			`description=dict(type="text"),`
			`longDescription=dict(type="text"),`
			`license=dict(`
			`type="nested",`
			`properties=dict(`
			`fullName=dict(type="text"),`
			`url=dict(type="text"),`
			`),`
			`),`
			`maintainers=dict(`
			`type="nested",`
			`properties=dict(`
			`name=dict(type="text"),`
			`email=dict(type="text"),`
			`github=dict(type="text"),`
			`),`
			`),`
add style to the search result 2020-04-10 08:13:50 +00:00			`platforms=dict(type="keyword"),`
initial version of import script 2020-03-28 01:34:38 +00:00			`position=dict(type="text"),`
import name as keyword (#30) otherwise elasticsearch will performs stemming which removes important information: Example: Name: "nixpkgs-review" Elasticsearch with 'text' mapping: ["nixpkgs", "review"] Now searching for nixpkgs-review will return no results. 2020-05-14 15:26:56 +00:00			`homepage=dict(type="keyword"),`
initial version of import script 2020-03-28 01:34:38 +00:00			`),`
			`),`
			`),`
			`)`
			`if es.indices.exists(f"{channel}-options"):`
			`es.indices.delete(index=f"{channel}-options")`
			`es.indices.create(`
			`index=f"{channel}-options",`
			`body=dict(`
Use a custom attrname analyzer (#35) * Use unsigned boto s3 requests, Without this change you need s3 credentials, even though the bucket is public * Use custom attrname analyzer * Adapt query to new schema Use pname/pversion to not clash with elasticsearch parsing of version 2020-05-19 10:54:48 +00:00			`settings=dict(number_of_shards=1, analysis=ANALYSIS),`
initial version of import script 2020-03-28 01:34:38 +00:00			`mappings=dict(`
			`properties=dict(`
index as keywords (#24) fixes #22 2020-05-11 21:33:25 +00:00			`option_name=dict(type="keyword"),`
initial version of import script 2020-03-28 01:34:38 +00:00			`description=dict(type="text"),`
			`type=dict(type="keyword"),`
			`default=dict(type="text"),`
			`example=dict(type="text"),`
			`source=dict(type="keyword"),`
			`),`
			`),`
			`),`
			`)`


			`@click.command()`
			`@click.option("--es-url", help="Elasticsearch connection url")`
			`@click.option("--channel")`
			`def main(es_url, channel):`
			`evaluation = get_last_evaluation(channel)`
			`es = elasticsearch.Elasticsearch([es_url])`
			`recreate_index(es, channel)`

			`# write packages`
			`number_of_packages, gen_packages = get_packages(evaluation)`
			`packages = list(gen_packages())`
			`if number_of_packages:`
			`click.echo("Indexing packages...")`
			`progress = tqdm.tqdm(unit="packages", total=number_of_packages)`
			`successes = 0`
			`for ok, action in elasticsearch.helpers.streaming_bulk(`
			`client=es,`
			`index=f"{channel}-packages",`
			`actions=gen_packages()):`
			`progress.update(1)`
			`successes += ok`
			`print("Indexed %d/%d packages" % (successes, number_of_packages))`

			`# write options`
			`number_of_options, gen_options = get_options(evaluation)`
			`options = list(gen_options())`
			`if number_of_options:`
			`click.echo("Indexing options...")`
			`progress = tqdm.tqdm(unit="options", total=number_of_options)`
			`successes = 0`
			`for ok, action in elasticsearch.helpers.streaming_bulk(`
			`client=es,`
			`index=f"{channel}-options",`
			`actions=gen_options()):`
			`progress.update(1)`
			`successes += ok`
			`print("Indexed %d/%d options" % (successes, number_of_options))`


			`if __name__ == "__main__":`
			`main()`