diff --git a/.github/workflows/build_openaddresses.yml b/.github/workflows/build_openaddresses.yml new file mode 100644 index 0000000..feecdf2 --- /dev/null +++ b/.github/workflows/build_openaddresses.yml @@ -0,0 +1,53 @@ +# This workflow will build a docker container, publish it to Azure Container Registry, and deploy it to Azure Kubernetes Service using a helm chart. +# +# https://github.com/Azure/actions-workflow-samples/tree/master/Kubernetes +# +# To configure this workflow: +# +# 1. Set up the following secrets in your workspace: +# a. REGISTRY_USERNAME with ACR username +# b. REGISTRY_PASSWORD with ACR Password +# c. AZURE_CREDENTIALS with the output of `az ad sp create-for-rbac --sdk-auth` +# +# 2. Change the values for the REGISTRY_NAME, CLUSTER_NAME, CLUSTER_RESOURCE_GROUP and NAMESPACE environment variables (below). +name: build_openaddresses +on: [pull_request] + +# Environment variables available to all jobs and steps in this workflow +env: + REGISTRY_NAME: k8scc01covidacr + CLUSTER_NAME: k8s-cancentral-02-covid-aks + CLUSTER_RESOURCE_GROUP: k8s-cancentral-01-covid-aks +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + + # Connect to Azure Container registry (ACR) + - uses: azure/docker-login@v1 + with: + login-server: ${{ env.REGISTRY_NAME }}.azurecr.io + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + + - name: Free disk space + run: | + sudo swapoff -a + sudo rm -f /swapfile + sudo apt clean + docker rmi $(docker image ls -aq) + df -h + + - run: | + docker build -f ./openaddresses-batch-machine/container/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ./openaddresses-batch-machine/container + docker tag ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest + + # Scan image for vulnerabilities + - uses: 
Azure/container-scan@v0 + with: + image-name: ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} + severity-threshold: CRITICAL + run-quality-checks: false + +# Insignificant change to trigger a build. Delete this line if found diff --git a/.github/workflows/publish_openaddresses.yml b/.github/workflows/publish_openaddresses.yml new file mode 100644 index 0000000..94f4078 --- /dev/null +++ b/.github/workflows/publish_openaddresses.yml @@ -0,0 +1,57 @@ +# This workflow will build a docker container, publish it to Azure Container Registry, and deploy it to Azure Kubernetes Service using a helm chart. +# +# https://github.com/Azure/actions-workflow-samples/tree/master/Kubernetes +# +# To configure this workflow: +# +# 1. Set up the following secrets in your workspace: +# a. REGISTRY_USERNAME with ACR username +# b. REGISTRY_PASSWORD with ACR Password +# c. AZURE_CREDENTIALS with the output of `az ad sp create-for-rbac --sdk-auth` +# +# 2. Change the values for the REGISTRY_NAME, CLUSTER_NAME, CLUSTER_RESOURCE_GROUP and NAMESPACE environment variables (below). 
+name: publish_openaddresses +on: + push: + branches: + - master + +# Environment variables available to all jobs and steps in this workflow +env: + REGISTRY_NAME: k8scc01covidacr + CLUSTER_NAME: k8s-cancentral-02-covid-aks + CLUSTER_RESOURCE_GROUP: k8s-cancentral-01-covid-aks +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@master + + # Connect to Azure Container registry (ACR) + - uses: azure/docker-login@v1 + with: + login-server: ${{ env.REGISTRY_NAME }}.azurecr.io + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_PASSWORD }} + + - name: Free disk space + run: | + sudo swapoff -a + sudo rm -f /swapfile + sudo apt clean + docker rmi $(docker image ls -aq) + df -h + + # Container build and push to a Azure Container registry (ACR) + - run: | + docker build -f ./openaddresses-batch-machine/container/Dockerfile -t ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ./openaddresses-batch-machine/container + docker tag ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest + docker push ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} + docker push ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:latest + + # Scan image for vulnerabilities + - uses: Azure/container-scan@v0 + with: + image-name: ${{ env.REGISTRY_NAME }}.azurecr.io/daaas-openaddresses-batch-machine:${{ github.sha }} + severity-threshold: CRITICAL + run-quality-checks: false diff --git a/openaddresses-batch-machine/README.md b/openaddresses-batch-machine/README.md new file mode 100644 index 0000000..04e8868 --- /dev/null +++ b/openaddresses-batch-machine/README.md @@ -0,0 +1,9 @@ +# Summary + +This repo builds and provides access to [this modification](https://github.com/JosephKuchar/batch-machine) of [OpenAddresses 
batch-machine](https://github.com/openaddresses/batch-machine). The container is built is similar to that specified by the OpenAddresses repo, but is pinned to a specific commit from JosephKuchar/batch-machine and restricts the user to be non-ROOT. + +# Usage: + +See `../.github/workflows/build_openaddresses.yml` (or `publish_openaddresses.yml`) for CI/build details, which build `./container/Dockerfile` + +See `./pipeline/get_openaddresses_data.ipynb` for example usage. Typically, the easiest way to invoke this is through a Kubeflow Pipeline. diff --git a/openaddresses-batch-machine/container/Dockerfile b/openaddresses-batch-machine/container/Dockerfile new file mode 100644 index 0000000..b90c463 --- /dev/null +++ b/openaddresses-batch-machine/container/Dockerfile @@ -0,0 +1,26 @@ +FROM alpine:3.11 + +ENV BATCH_MACHINE_PATH=/batch-machine + +RUN apk add nodejs yarn git python3 python3-dev py3-pip \ + py3-gdal gdal gdal-dev make bash sqlite-dev zlib-dev \ + postgresql-libs gcc g++ musl-dev postgresql-dev cairo \ + py3-cairo file + +# Download and install Tippecanoe +RUN git clone -b 1.35.0 https://github.com/mapbox/tippecanoe.git /tmp/tippecanoe && \ + cd /tmp/tippecanoe && \ + make && \ + PREFIX=/usr/local make install && \ + rm -rf /tmp/tippecanoe + +# Get/install batch-machine +RUN git clone https://github.com/JosephKuchar/batch-machine $BATCH_MACHINE_PATH && \ + pip3 install $BATCH_MACHINE_PATH + +# Restrict to non-root access +RUN addgroup appgroup && \ + adduser -S -g appgroup appuser +USER appuser + +CMD python3 ${BATCH_MACHINE_PATH}/test.py diff --git a/openaddresses-batch-machine/pipeline/components/copy_to_minio.yaml b/openaddresses-batch-machine/pipeline/components/copy_to_minio.yaml new file mode 100644 index 0000000..352fbeb --- /dev/null +++ b/openaddresses-batch-machine/pipeline/components/copy_to_minio.yaml @@ -0,0 +1,35 @@ +name: Copy to Minio +inputs: +- {name: Minio URL, type: URL, description: 'Minio instance URL, starting with http://'} +- 
{name: Minio access key, type: String} +- {name: Minio secret key, type: String} +- {name: Local source, description: 'Local source of upload'} +- {name: Minio destination, type: String, description: 'Minio destination location in format /'} +- {name: Flags, optional: true, default: '', type: String, description: 'Flags/options passed to mc'} +outputs: +- {name: Minio destination, type: String} +- {name: md5sum, type: String, description: 'A combined md5sum of all data passed to MinIO'} +implementation: + container: + image: minio/mc + command: + - sh + - -ex + - -c + - | + FLAGS=$7 + mkdir -p "$(dirname "$5")" + mkdir -p "$(dirname "$6")" + mc config host add my_minio $0 $1 $2 + mc cp $FLAGS $3 my_minio/$4 + echo "$4" > "$5" + # Use find in case we retrieved a directory - this gets all files in the dir + find $3 -type f -exec md5sum {} \; | sort -k 2 | md5sum | awk '{print $1}' > $6 + - {inputValue: Minio URL} + - {inputValue: Minio access key} + - {inputValue: Minio secret key} + - {inputPath: Local source} + - {inputValue: Minio destination} + - {outputPath: Minio destination} + - {outputPath: md5sum} + - {inputValue: Flags} diff --git a/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml b/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml new file mode 100644 index 0000000..cfde79b --- /dev/null +++ b/openaddresses-batch-machine/pipeline/components/openaddresses_get_data.yaml @@ -0,0 +1,26 @@ +name: Download Data from OpenAddresses +inputs: +- {name: source_json, type: JsonObject, description: 'OpenAddresses source specification in JSON format'} +- {name: args, type: String, optional: true, default: '', description: 'Optional command line args to pass to openaddr-process-one, such as "--layer addresses --layersource city"'} +outputs: +- {name: data, description: 'All data downloaded from OpenAddresses call'} +implementation: + container: + image: 
k8scc01covidacr.azurecr.io/daaas-openaddresses-batch-machine:latest + command: + - sh + - -ex + - -c + - | + SOURCE_JSON=$0 + ARGS=$1 + OUTPUT_PATH=$2 + mkdir -p $OUTPUT_PATH + + cat $SOURCE_JSON + + openaddr-process-one $SOURCE_JSON $OUTPUT_PATH $ARGS + + - {inputPath: source_json} + - {inputValue: args} + - {outputPath: data} diff --git a/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb b/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb new file mode 100644 index 0000000..634533b --- /dev/null +++ b/openaddresses-batch-machine/pipeline/get_openaddresses_data.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary\n", + "\n", + "Uses the OpenAddresses tooling to \n", + "\n", + "* download data based on a user-defined JSON source specification\n", + "* save the data in a user-defined location in MinIO\n", + "\n", + "Usage: \n", + "\n", + "* Edit the below settings and run notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "import kfp\n", + "from kfp.components import load_component_from_file\n", + "from kfp import dsl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# User settings:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "minio_bucket_name = 'FIRSTNAME-LASTNAME'\n", + "json_source_file = \"myfile.json\"\n", + "minio_tenant = 'minimal'\n", + "openaddresses_args = \"--layer addresses --layersource city\"\n", + "minio_output_uri = f'{minio_bucket_name}/path/to/storage/location'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Additional Inputs for Debugging/Testing:\n", + "\n", + "If testing/debugging, you can use the following demo JSON file. 
\n", + "\n", + "You can also use this method to write your own JSON file from the notebook if that makes sense for your workflow. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %%writefile $json_source_file\n", + "# {\n", + "# \"coverage\": {\n", + "# \"country\": \"ca\",\n", + "# \"state\": \"ab\",\n", + "# \"city\": \"Calgary\",\n", + "# \"geometry\": {\n", + "# \"type\": \"Point\",\n", + "# \"coordinates\": [\n", + "# -114.08,\n", + "# 51.08\n", + "# ]\n", + "# }\n", + "# },\n", + "# \"schema\": 2,\n", + "# \"layers\": {\n", + "# \"addresses\": [\n", + "# {\n", + "# \"name\": \"city\",\n", + "# \"data\": \"https://data.calgary.ca/api/views/uwj2-d2wc/rows.csv?accessType=DOWNLOAD\",\n", + "# \"website\": \"https://data.calgary.ca/\",\n", + "# \"license\": {\n", + "# \"url\": \"https://data.calgary.ca/stories/s/Open-Calgary-Terms-of-Use/u45n-7awa\",\n", + "# \"text\": \"Contains information licensed under the Open Government Licence – City of Calgary.\",\n", + "# \"attribution name\": \"City of Calgary\"\n", + "# },\n", + "# \"protocol\": \"http\",\n", + "# \"conform\": {\n", + "# \"format\": \"csv\",\n", + "# \"lat\": \"latitude\",\n", + "# \"lon\": \"longitude\",\n", + "# \"number\": {\n", + "# \"function\": \"join\",\n", + "# \"fields\": [\n", + "# \"HOUSE_NUMBER\",\n", + "# \"HOUSE_ALPHA\"\n", + "# ],\n", + "# \"separator\": \"\"\n", + "# },\n", + "# \"street\": [\n", + "# \"STREET_NAME\",\n", + "# \"STREET_TYPE\",\n", + "# \"STREET_QUAD\"\n", + "# ],\n", + "# \"str_name\": \"STREET_NAME\",\n", + "# \"str_type\": \"STREET_TYPE\",\n", + "# \"str_dir\": \"STREET_QUAD\",\n", + "# \"full_addr\": \"ADDRESS\"\n", + "# },\n", + "# \"attribution\": \"City of Calgary\"\n", + "# }\n", + "# ]\n", + "# }\n", + "# }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Component/Pipeline definitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "openaddresses_get_op = load_component_from_file(\n", + " \"./components/openaddresses_get_data.yaml\"\n", + ")\n", + "copy_to_minio_op = load_component_from_file(\n", + " \"./components/copy_to_minio.yaml\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@dsl.pipeline(\n", + " name=\"Download OpenAddresses Data to Minio\"\n", + ")\n", + "def pipeline(\n", + " source_json,\n", + " minio_output_uri: str,\n", + " # TODO: Handle these automatically once multitenancy is available\n", + " minio_url,\n", + " minio_access_key: str,\n", + " minio_secret_key: str,\n", + " openaddresses_args: str = \"\",\n", + "):\n", + " operations = {}\n", + "\n", + " operations['Get Data'] = openaddresses_get_op(\n", + " source_json=source_json,\n", + " args=openaddresses_args,\n", + " ).set_image_pull_policy(\"Always\")\n", + "\n", + " operations['Store Data'] = copy_to_minio_op(\n", + " local_source=operations['Get Data'].outputs['data'],\n", + " minio_destination=minio_output_uri,\n", + " minio_url=minio_url,\n", + " minio_access_key=minio_access_key,\n", + " minio_secret_key=minio_secret_key,\n", + " flags=\"--recursive\", # Because outputs['data'] is a directory\n", + " )\n", + " # Set all operations display names to their key in the operations dict\n", + " for name, op in operations.items():\n", + " op.set_display_name(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build pipeline arguments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the JSON source file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(json_source_file, 'r') as fin:\n", + " source_json = json.load(fin)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": { + "iopub.execute_input": "2020-10-21T18:02:22.261643Z", + "iopub.status.busy": 
"2020-10-21T18:02:22.261379Z", + "iopub.status.idle": "2020-10-21T18:02:22.264434Z", + "shell.execute_reply": "2020-10-21T18:02:22.263870Z", + "shell.execute_reply.started": "2020-10-21T18:02:22.261618Z" + } + }, + "source": [ + "## Get MinIO credentials from the Notebook Server (could also specify these things manually)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get minio credentials using a helper\n", + "from utilities import get_minio_credentials\n", + "\n", + "minio_settings = get_minio_credentials(minio_tenant, strip_http=False)\n", + "minio_url = minio_settings[\"url\"]\n", + "minio_access_key = minio_settings[\"access_key\"]\n", + "minio_secret_key = minio_settings[\"secret_key\"]\n", + "\n", + "arguments = dict(\n", + " source_json=json.dumps(source_json),\n", + " openaddresses_args=openaddresses_args,\n", + " minio_output_uri=minio_output_uri,\n", + " minio_url=minio_url,\n", + " minio_access_key=minio_access_key,\n", + " minio_secret_key=minio_secret_key,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_run = kfp.Client().create_run_from_pipeline_func(\n", + " pipeline,\n", + " arguments=arguments,\n", + " run_name=\"openaddresses-get-store-data\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/openaddresses-batch-machine/pipeline/utilities.py b/openaddresses-batch-machine/pipeline/utilities.py new file mode 
"""Helpers for reading MinIO credentials from vault-mounted env files and
for copying/locating files in MinIO (used by the OpenAddresses pipeline)."""
import re


def parse_env_var_def(s):
    """
    Parse string defining a shell environment variable, returning name and val

    Args:
        s (str): A single line of shell, e.g. `export MINIO_URL="http://..."`

    Returns:
        (tuple or None): (varname, value) with surrounding whitespace and any
        wrapping single/double quotes stripped, if the line matches the
        `export NAME=value` pattern and both sides are non-empty; else None
    """
    match = re.search(r"\s*(?<=export)\s+([^=]+)=(.*)", s)
    if match:
        lhs, rhs = match.groups()

        # Remove whitespace and any quoted strings' quotes
        lhs = lhs.strip().strip('\'').strip('"')
        rhs = rhs.strip().strip('\'').strip('"')
        # If both sides exist, return them
        if lhs and rhs:
            return lhs, rhs
    return None


def get_env_variables_from_file(filepath):
    """
    Returns a dictionary of the environment variables defined in a file

    Lines that do not match the `export NAME=value` pattern are skipped.
    """
    with open(filepath, 'r') as fin:
        lines = fin.readlines()
    parsed = [parse_env_var_def(line) for line in lines]

    # Return a dict of lhs:rhs, skipping any lines that did not parse
    return {name: value for name, value in (p for p in parsed if p)}


def get_minio_credentials(tenant, strip_http=True, verbose=True):
    """
    Retrieve minio credentials from the vault (available from notebook server)

    Args:
        tenant (str): Minio tenant name, such as "minimal" or "premium"
        strip_http (bool): If True, strips http:// (or https://) from the
            start of the minio URL
        verbose (bool): If True, print the vault file path being read

    Returns:
        (dict): Dict with keys:
            url
            access_key
            secret_key

    Raises:
        KeyError: If any of the expected credentials is missing from the
            vault file
    """
    vault = f"/vault/secrets/minio-{tenant}-tenant1"
    if verbose:
        print("Trying to access minio credentials from:")
        print(vault)
    d = get_env_variables_from_file(vault)

    # Select only the keys that we want, also checking that they exist at all
    key_map = {
        "MINIO_URL": "url",
        "MINIO_ACCESS_KEY": "access_key",
        "MINIO_SECRET_KEY": "secret_key",
    }
    minio_credentials = {}
    for k in key_map:
        try:
            minio_credentials[key_map[k]] = d[k]
        except KeyError:
            raise KeyError(f"Cannot find minio credential {k} in vault file")

    if strip_http:
        # Get rid of http:// or https:// in minio URL
        minio_credentials["url"] = re.sub(r'^https?://',
                                          "",
                                          minio_credentials["url"],
                                          )

    return minio_credentials


def create_bucket_if_missing(minio_obj, bucket):
    """Create `bucket` on the given Minio client if it does not already exist."""
    if not minio_obj.bucket_exists(bucket):
        minio_obj.make_bucket(bucket)
        print(f"Created bucket {bucket}")


def copy_to_minio(minio_url, bucket, access_key, secret_key, sourcefile,
                  destination):
    """
    Upload a local file to bucket/destination in MinIO, creating the bucket
    if needed.

    NOTE(review): connects with secure=False (plain HTTP) and a hard-coded
    region of "us-west-1" — confirm these match the target MinIO deployment.
    """
    # Imported locally so the module can be used without minio installed
    from minio import Minio

    # Store results to minio
    s3 = Minio(
        minio_url,
        access_key=access_key,
        secret_key=secret_key,
        secure=False,
        region="us-west-1",
    )

    # Create bucket if needed
    create_bucket_if_missing(s3, bucket)

    # Put file into bucket
    s3.fput_object(bucket, destination, sourcefile)


def minio_find_files_matching_pattern(minio_url, bucket, access_key,
                                      secret_key, pattern, prefix='',
                                      recursive=True):
    """
    Returns all files in a minio location that match the given pattern

    This function is glob-like in idea, but uses regex patterns instead
    of glob patterns.

    Args:
        minio_url (str): MinIO endpoint (host:port, no scheme)
        bucket (str): Bucket to search
        access_key (str): MinIO access key
        secret_key (str): MinIO secret key
        pattern (str): Regex applied with `match` to each object name
        prefix (str): Only list objects under this prefix
        recursive (bool): If True, descend into "subdirectories"

    Returns:
        (list): Object names (str) matching the pattern
    """
    # Imported locally so the module can be used without minio installed
    from minio import Minio
    pattern = re.compile(pattern)

    s3 = Minio(
        minio_url,
        access_key=access_key,
        secret_key=secret_key,
        secure=False,
        region="us-west-1",
    )

    # Get everything in bucket/prefix
    # Fix: pass the caller's `recursive` through (was hard-coded to True,
    # silently ignoring the parameter)
    objs = s3.list_objects(bucket, prefix=prefix, recursive=recursive)

    # Discard directories
    filepaths = [obj.object_name for obj in objs if not obj.is_dir]

    # Select only those that fit the pattern
    matching = [filepath for filepath in filepaths if pattern.match(filepath)]

    return matching