aux-search/scripts/import-channel

#! /usr/bin/env nix-shell
#! nix-shell -i python3 -p python3 python3Packages.requests python3Packages.click python3Packages.click-log python3Packages.elasticsearch python3Packages.boto3 python3Packages.tqdm

# develop:
# $ nix-shell -p python3Packages.black python3Packages.mypy python3Packages.flake8
#
# format:
# $ nix-shell -p python3Packages.black --command "black import-channel"
#
# lint:
# $ nix-shell -p python3Packages.flake8 --command "flake8 --ignore E501,E265 import-channel"

import boto3
import click
import logging
import click_log
import elasticsearch
import elasticsearch.helpers
import requests
import json
import os.path
import shlex
import subprocess
import tqdm
import botocore.client
import botocore

# Module-level logger used by every function in this script.
# click_log.basic_config() sets up click-log's handler/formatter on it; the
# effective level is chosen later in main() from the -v/--verbose count.
logger = logging.getLogger("import-channel")
click_log.basic_config(logger)


# Directory this script lives in; used to locate files shipped next to it.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

# Embedded in every alias/index name built by create_index_name().
INDEX_SCHEMA_VERSION = 3

# Common separators like underscores, dots and dashes
_SEPARATORS = "[_.-]"

# Camelcase tokenizer adapted from
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html
_CAMELCASE_BOUNDARY = (
    "(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})"  # lower case followed by upper case
    "|"
    "(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"  # or upper case followed by upper + lower case
)


def _custom_analyzer(tokenizer, *filters):
    """Return a ``custom`` analyzer definition using ``tokenizer`` and ``filters``."""
    return {"type": "custom", "tokenizer": tokenizer, "filter": list(filters)}


def _pattern_tokenizer(*alternatives):
    """Return a ``pattern`` tokenizer that splits on any of ``alternatives``."""
    return {"type": "pattern", "pattern": "|".join(alternatives)}


# Elasticsearch "analysis" settings installed on every index by ensure_index().
ANALYSIS = {
    "analyzer": {
        "nixAttrName": _custom_analyzer("nix_attrname", "lowercase", "nix_stopwords"),
        "nixOptionName": _custom_analyzer("nix_option_name", "lowercase"),
        "nixOptionNameGranular": _custom_analyzer(
            "nix_option_name_granular", "lowercase"
        ),
    },
    "tokenizer": {
        # Split on attrname separators like _, .
        "nix_attrname": _pattern_tokenizer(
            _SEPARATORS,
            "\\d+?Packages",  # python37Packages -> python
            _CAMELCASE_BOUNDARY,
        ),
        "nix_option_name": _pattern_tokenizer("[.]"),
        # Lower priority (virtualHost -> [virtual, host])
        "nix_option_name_granular": _pattern_tokenizer(
            _SEPARATORS, _CAMELCASE_BOUNDARY
        ),
    },
    "filter": {
        "nix_stopwords": {
            "type": "stop",
            "ignore_case": True,
            "stopwords": ["packages", "package", "options", "option"],
        },
    },
}
def _field(kind, **extra):
    """Return a fresh Elasticsearch field definition of type ``kind``."""
    return {"type": kind, **extra}


def _nested(**properties):
    """Return a ``nested`` field definition with the given sub-properties."""
    return _field("nested", properties=properties)


# Package fields
_PACKAGE_FIELDS = {
    "package_hydra_build_id": _field("keyword"),
    "package_hydra_build_status": _field("keyword"),
    "package_hydra_project": _field("keyword"),
    "package_hydra_job": _field("keyword"),
    "package_hydra_jobset": _field("keyword"),
    "package_hydra_path": _nested(output=_field("keyword"), path=_field("keyword")),
    "package_hydra_drvpath": _field("keyword"),
    "package_attr_name": _field(
        "text", analyzer="nixAttrName", fields={"raw": _field("keyword")}
    ),
    "package_attr_set": _field("keyword"),
    "package_pname": _field("keyword"),
    "package_pversion": _field("keyword"),
    "package_description": _field("text"),
    "package_longDescription": _field("text"),
    "package_license": _nested(fullName=_field("text"), url=_field("text")),
    "package_maintainers": _nested(
        name=_field("text"), email=_field("text"), github=_field("text")
    ),
    "package_platforms": _field("keyword"),
    "package_position": _field("text"),
    "package_homepage": _field("keyword"),
}

# Options fields
_OPTION_FIELDS = {
    "option_name": _field(
        "text",
        analyzer="nixOptionName",
        fielddata=True,
        fields={
            "raw": _field("keyword"),
            "granular": _field("text", analyzer="nixOptionNameGranular"),
        },
    ),
    "option_description": _field("text"),
    "option_type": _field("keyword"),
    "option_default": _field("text"),
    "option_example": _field("text"),
    "option_source": _field("keyword"),
}

# Index mapping shared by package and option documents; the "type" field
# tells the two kinds of document apart.
MAPPING = {
    "properties": {
        "type": _field("keyword"),
        **_PACKAGE_FIELDS,
        **_OPTION_FIELDS,
    },
}


def _parse_evaluation(key_prefix, skip):
    """Parse one S3 common prefix into an evaluation dict, or ``None``.

    The first ``skip`` characters of ``key_prefix`` are dropped, an optional
    ``beta`` marker is removed, and the remainder must look like
    ``<revisions-since-start>.<git-revision>/`` with an integer revision count.
    Anything else is reported as ``None`` so the caller can skip it.
    """
    if not key_prefix:
        return None
    name = key_prefix[skip:]
    if name.startswith("beta"):
        name = name[len("beta") :]
    parts = name.lstrip(".").rstrip("/").split(".")
    if len(parts) != 2:
        return None
    revisions_since_start, git_revision = parts
    try:
        revisions = int(revisions_since_start)
    except ValueError:
        return None
    return {
        "revisions_since_start": revisions,
        "git_revision": git_revision,
        "prefix": key_prefix,
    }


def get_last_evaluation(channel):
    """Return the most recent evaluation published for ``channel``.

    The ``nix-releases`` S3 bucket is listed anonymously under
    ``<project>/<version>/``; every sub-prefix that parses as an evaluation is
    collected and the one with the highest ``revisions_since_start`` wins.
    Its Hydra evaluation id is then read from the ``src-url`` object stored
    under that prefix.

    Args:
        channel: Channel name in ``<project>-<version>`` form.

    Returns:
        A dict with the keys ``revisions_since_start`` (int), ``git_revision``
        (str), ``prefix`` (S3 key prefix of the evaluation) and ``id`` (str).

    Raises:
        click.ClickException: If ``channel`` is malformed or no evaluation
            could be found for it.
    """
    logger.debug(f"Retriving last evaluation for {channel} channel")

    try:
        project, project_version = channel.split("-", 1)
    except (AttributeError, ValueError):
        raise click.ClickException(
            f"Invalid channel name {channel!r}, expected '<project>-<version>'"
        ) from None
    logger.debug(f"get_last_evaluation: project='{project}'")
    logger.debug(f"get_last_evaluation: project_version='{project_version}'")

    bucket = "nix-releases"
    prefix = f"{project}/{project_version}/"
    logger.debug(
        f"get_last_evaluation: list all evaluation in '{bucket}' bucker under '{prefix}' prefix"
    )

    s3 = boto3.client(
        "s3", config=botocore.client.Config(signature_version=botocore.UNSIGNED)
    )

    # Number of leading characters to drop from each listed prefix before
    # parsing. NOTE(review): this is purely length based, so it assumes the
    # released directory name starts with something as long as the channel
    # name -- confirm against the bucket layout before changing it.
    skip = len(f"{project}/{project_version}/{channel}")

    # A single list_objects call returns at most one page of results, so walk
    # every page; otherwise the newest evaluations can be silently missed.
    evaluations = []
    paginator = s3.get_paginator("list_objects")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
        # "CommonPrefixes" is absent when a page holds no sub-prefixes.
        for item in page.get("CommonPrefixes") or []:
            if not item:
                continue
            logger.debug(f"get_last_evaluation: evaluation in raw {item}")
            evaluation = _parse_evaluation(item.get("Prefix"), skip)
            if evaluation is None:
                continue
            logger.debug(f"get_last_evaluation: evaluation {evaluation}")
            evaluations.append(evaluation)

    logger.debug(
        f"get_last_evaluation: {len(evaluations)} evaluations found for {channel} channel"
    )
    if not evaluations:
        raise click.ClickException(
            f"No evaluations found for {channel} channel in '{bucket}' bucket under '{prefix}' prefix"
        )

    evaluation = sorted(evaluations, key=lambda i: i["revisions_since_start"])[-1]

    result = s3.get_object(Bucket=bucket, Key=f"{evaluation['prefix']}src-url")
    # Strip surrounding whitespace so a trailing newline in the object never
    # ends up inside the id (which is later used in a URL and a file name).
    src_url = result.get("Body").read().decode().strip()
    evaluation["id"] = src_url[len("https://hydra.nixos.org/eval/") :]

    logger.debug(f"get_last_evaluation: last evaluation is: {evaluation}")

    return evaluation


def get_evaluation_builds(evaluation_id):
    """Return the Hydra builds of an evaluation, keyed by ``<nixname>.<system>``.

    The build list is cached in ``eval-<evaluation_id>.json`` in the current
    working directory and only downloaded from hydra.nixos.org when that file
    does not exist yet.

    Args:
        evaluation_id: Hydra evaluation id.

    Returns:
        A dict mapping ``f"{build['nixname']}.{build['system']}"`` to the
        build dict. When several builds share a key the last one wins.

    Raises:
        requests.HTTPError: If Hydra answers with an error status.
    """
    logger.debug(f"get_evaluation_builds: Retriving list of builds for {evaluation_id} evaluation id")
    filename = f"eval-{evaluation_id}.json"
    if not os.path.exists(filename):
        url = f"https://hydra.nixos.org/eval/{evaluation_id}/builds"
        logger.debug(f"get_evaluation_builds: Fetching builds from {url} url.")
        headers = {
            "Content-Type": "application/json"
        }
        # Download into a side file and rename it only once it is complete, so
        # a failed or interrupted download never poisons the cache.
        partial = f"{filename}.part"
        with requests.get(url, headers=headers, stream=True) as r:
            # Fail loudly instead of caching an error page as "JSON".
            r.raise_for_status()
            with open(partial, "wb") as raw:
                # wrapattr only reports progress; closing ``raw`` is up to us.
                with tqdm.tqdm.wrapattr(
                    raw,
                    "write",
                    miniters=1,
                    total=int(r.headers.get('content-length', 0)),
                    desc=filename
                ) as f:
                    for chunk in r.iter_content(chunk_size=4096):
                        f.write(chunk)
        os.replace(partial, filename)

    with open(filename) as f:
        builds = json.load(f)

    return {
        f"{build['nixname']}.{build['system']}": build
        for build in builds
    }


def _normalize_licenses(raw):
    """Return ``meta.license`` as a list of ``{"fullName", "url"}`` dicts.

    ``raw`` may be missing/empty, a single string, a single dict, or a list
    mixing strings and dicts.
    """
    if not raw:
        return []
    if isinstance(raw, (str, dict)):
        raw = [raw]
    return [
        dict(fullName=item, url=None)
        if isinstance(item, str)
        else dict(fullName=item.get("fullName"), url=item.get("url"))
        for item in raw
    ]


def _normalize_maintainers(raw):
    """Return ``meta.maintainers`` as ``{"name", "email", "github"}`` dicts."""
    return [
        dict(name=item, email=None, github=None)
        if isinstance(item, str)
        else dict(
            name=item.get("name"),
            email=item.get("email"),
            github=item.get("github"),
        )
        for item in raw
    ]


def get_packages(evaluation, evaluation_builds):
    """List every package of an evaluation as Elasticsearch documents.

    Runs ``nix-env -qa --json`` against the nixpkgs-channels tarball of
    ``evaluation['git_revision']`` and pairs every package with its Hydra
    build, if ``evaluation_builds`` has one.

    Args:
        evaluation: Evaluation dict; only ``git_revision`` is read here.
        evaluation_builds: Mapping of ``<name>.<system>`` to Hydra build dicts.

    Returns:
        A ``(count, gen)`` tuple: the number of packages and a zero-argument
        callable returning a generator of package documents.

    Raises:
        subprocess.CalledProcessError: If ``nix-env`` exits with an error.
    """
    logger.debug(
        f"get_packages: Retriving list of packages for '{evaluation['git_revision']}' revision"
    )
    # Pass an argv list rather than a shell-like string so quotes in
    # CURRENT_DIR cannot change how the command line is split.
    result = subprocess.run(
        [
            "nix-env",
            "-f",
            "<nixpkgs>",
            "-I",
            f"nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/{evaluation['git_revision']}.tar.gz",
            "--arg",
            "config",
            f"import {CURRENT_DIR}/packages-config.nix",
            "-qa",
            "--json",
        ],
        stdout=subprocess.PIPE,
        check=True,
    )
    packages = list(json.loads(result.stdout).items())

    def gen():
        for attr_name, data in packages:
            meta = data["meta"]

            position = meta.get("position")
            if position and position.startswith("/nix/store"):
                # Drop the store-path prefix; 44 is presumably
                # len("/nix/store/") + a 32-character hash + "-" (TODO confirm).
                position = position[44:]

            licenses = _normalize_licenses(meta.get("license"))
            maintainers = _normalize_maintainers(meta.get("maintainers", []))

            # Only non-empty string platforms are kept.
            platforms = [
                platform
                for platform in meta.get("platforms", [])
                if isinstance(platform, str) and platform
            ]

            # Only "...Packages" / "...Plugins" prefixes count as an attr set.
            attr_set = None
            if "." in attr_name:
                first = attr_name.split(".")[0]
                if first.endswith(("Packages", "Plugins")):
                    attr_set = first

            # Every Hydra field defaults to None and is reset for each
            # package, so a package without a build neither crashes nor
            # inherits values from the previous one.
            hydra_build_id = None
            hydra_build_status = None
            hydra_project = None
            hydra_job = None
            hydra_jobset = None
            hydra_path = None
            hydra_drvpath = None
            build = evaluation_builds.get(f"{data['name']}.{data['system']}")
            if build is not None:
                hydra_build_id = build['id']
                hydra_build_status = build['buildstatus']
                hydra_project = build['project']
                hydra_job = build['job']
                hydra_jobset = build['jobset']
                hydra_path = [
                    {
                        "output": output,
                        "path": item['path'],
                    }
                    for output, item in build['buildoutputs'].items()
                ]
                hydra_drvpath = build['drvpath']

            yield dict(
                type="package",
                package_hydra_build_id=hydra_build_id,
                package_hydra_build_status=hydra_build_status,
                package_hydra_project=hydra_project,
                package_hydra_job=hydra_job,
                package_hydra_jobset=hydra_jobset,
                package_hydra_path=hydra_path,
                package_hydra_drvpath=hydra_drvpath,
                package_attr_name=attr_name,
                package_attr_set=attr_set,
                package_pname=data["pname"],
                package_pversion=data["version"],
                package_description=meta.get("description"),
                package_longDescription=meta.get("longDescription", ""),
                package_license=licenses,
                package_maintainers=maintainers,
                package_platforms=platforms,
                package_position=position,
                package_homepage=meta.get("homepage"),
            )

    logger.debug(f"get_packages: Found {len(packages)} packages")
    return len(packages), gen


def get_options(evaluation):
    """List every NixOS option of an evaluation as Elasticsearch documents.

    Builds the ``options`` attribute of ``<nixpkgs/nixos/release.nix>`` for
    ``evaluation['git_revision']`` and reads
    ``share/doc/nixos/options.json`` from the resulting store path.

    Args:
        evaluation: Evaluation dict; only ``git_revision`` is read here.

    Returns:
        A ``(count, gen)`` tuple: the number of options and a zero-argument
        callable returning a generator of option documents. The count is 0
        when the build output contains no ``options.json``.

    Raises:
        subprocess.CalledProcessError: If ``nix-build`` exits with an error.
    """
    result = subprocess.run(
        [
            "nix-build",
            "<nixpkgs/nixos/release.nix>",
            "--no-out-link",
            "-A",
            "options",
            "-I",
            f"nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/{evaluation['git_revision']}.tar.gz",
        ],
        stdout=subprocess.PIPE,
        check=True,
    )
    options = []
    options_file = result.stdout.strip().decode()
    options_file = f"{options_file}/share/doc/nixos/options.json"
    if os.path.exists(options_file):
        with open(options_file) as f:
            options = list(json.load(f).items())
    else:
        # Still best-effort (nothing gets indexed), but no longer silent.
        logger.warning(f"get_options: '{options_file}' does not exist, no options found")

    def gen():
        for name, option in options:
            example = option.get("example")
            # Unwrap {"_type": "literalExample", "text": ...} to its text.
            if (
                example
                and isinstance(example, dict)
                and example.get("_type") == "literalExample"
            ):
                example = str(example["text"])

            # A missing, null or empty "declarations" list means no source.
            declarations = option.get("declarations") or [None]

            # NOTE(review): str() turns a missing default/example into the
            # literal string "None"; kept as-is since consumers of the index
            # may depend on it -- confirm before changing.
            yield dict(
                type="option",
                option_name=name,
                option_description=option.get("description"),
                option_type=option.get("type"),
                option_default=str(option.get("default")),
                option_example=str(example),
                option_source=declarations[0],
            )

    return len(options), gen


def ensure_index(es, index, mapping):
    """Create ``index`` with the shared analysis settings unless it exists.

    Args:
        es: Elasticsearch client.
        index: Name of the index to create.
        mapping: Mapping body stored under ``"mappings"``.

    Returns:
        ``True`` if the index was created by this call, ``False`` if it
        already existed (in which case it is left untouched).
    """
    # Keyword form: newer elasticsearch-py clients reject positional arguments.
    if es.indices.exists(index=index):
        logger.debug(f"ensure_index: index '{index}' already exists")
        return False

    es.indices.create(
        index=index,
        body={
            "settings": {"number_of_shards": 1, "analysis": ANALYSIS},
            "mappings": mapping,
        },
    )
    logger.debug(f"ensure_index: index '{index}' was created")

    return True


def create_index_name(channel, evaluation):
    """Return the ``(alias_name, index_name)`` pair for a channel evaluation.

    The alias name depends only on the schema version and the channel; the
    index name additionally encodes the evaluation's ``id``,
    ``revisions_since_start`` and ``git_revision``.
    """
    alias_name = f"latest-{INDEX_SCHEMA_VERSION}-{channel}"
    index_name = f"evaluation-{INDEX_SCHEMA_VERSION}-{channel}-{evaluation['id']}-{evaluation['revisions_since_start']}-{evaluation['git_revision']}"
    return alias_name, index_name


def update_alias(es, name, index):
    """Make the alias ``name`` refer to ``index`` and to nothing else.

    A missing alias is simply created. An existing one is updated with a
    single ``update_aliases`` request that removes every other index and adds
    ``index`` if it is not part of the alias yet.
    """
    if not es.indices.exists_alias(name=name):
        es.indices.put_alias(index=index, name=name)
    else:
        current = set(es.indices.get_alias(name=name).keys())

        # indexes to remove from alias
        actions = []
        for stale in current.difference(set([index])):
            actions.append({"remove": {"index": stale, "alias": name}})

        # add index if does not exists in alias
        if index not in current:
            actions.append({"add": {"index": index, "alias": name}})

        if actions:
            es.indices.update_aliases({"actions": actions})

    # Read the alias back so the log reflects what it actually resolves to.
    indexes = ", ".join(es.indices.get_alias(name=name).keys())
    logger.debug(f"'{name}' alias now points to '{indexes}' index")


def write(unit, es, index_name, number_of_items, item_generator):
    """Bulk-index the documents from ``item_generator()`` into ``index_name``.

    Args:
        unit: Label for the items, used in messages and on the progress bar.
        es: Elasticsearch client.
        index_name: Target index.
        number_of_items: Expected number of documents; when falsy nothing is
            indexed and nothing is printed.
        item_generator: Zero-argument callable returning an iterable of
            documents.
    """
    if not number_of_items:
        return

    click.echo(f"Indexing {unit}...")
    successes = 0
    # The context manager closes the progress bar even when bulk indexing
    # raises, so the terminal is left in a clean state.
    with tqdm.tqdm(unit=unit, total=number_of_items) as progress:
        for ok, _ in elasticsearch.helpers.streaming_bulk(
            client=es, index=index_name, actions=item_generator()
        ):
            progress.update(1)
            successes += ok
    click.echo(f"Indexed {successes}/{number_of_items} {unit}")


@click.command()
@click.option("-u", "--es-url", required=True, help="Elasticsearch connection url")
@click.option("-c", "--channel", required=True, help="NixOS channel name")
@click.option("-v", "--verbose", count=True)
def main(es_url, channel, verbose):
    """Import the latest evaluation of a NixOS channel into Elasticsearch.

    Packages and options are written to a per-evaluation index, after which
    the channel's "latest" alias is pointed at that index.
    """
    # No -v: CRITICAL only, -v: WARNING, -vv (or more): DEBUG.
    logging_level = "CRITICAL"
    if verbose == 1:
        logging_level = "WARNING"
    elif verbose >= 2:
        logging_level = "DEBUG"

    logger.setLevel(getattr(logging, logging_level))
    logger.debug(f"Verbosity is {verbose}")
    logger.debug(f"Logging set to {logging_level}")

    evaluation = get_last_evaluation(channel)
    evaluation_builds = get_evaluation_builds(evaluation['id'])

    es = elasticsearch.Elasticsearch([es_url])

    # ensure indexes exist
    alias_name, index_name = create_index_name(channel, evaluation)
    index_created = ensure_index(es, index_name, MAPPING)

    # An index that already exists is taken to have been imported by an
    # earlier run, so only a freshly created one gets populated.
    if index_created:
        write("packages", es, index_name, *get_packages(evaluation, evaluation_builds))
        write("options", es, index_name, *get_options(evaluation))

    update_alias(es, alias_name, index_name)


if __name__ == "__main__":
    main()

# vi:ft=python