diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 109c7ae..ecc06a1 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -15,6 +15,7 @@ on:
   push:
     branches:
       - master
+      - feat-notebook-restart
 
 # Environment variables available to all jobs and steps in this workflow
 env:
diff --git a/notebook-restart/Dockerfile b/notebook-restart/Dockerfile
new file mode 100644
index 0000000..652a7cb
--- /dev/null
+++ b/notebook-restart/Dockerfile
@@ -0,0 +1,82 @@
+FROM ubuntu:20.04
+
+# Run every RUN instruction through bash with pipefail, so a failed download in
+# a `curl ... | bash` pipeline fails the build instead of piping nothing to bash.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+ARG NOTROOTUSER="jovyan"
+ARG NB_UID="1000"
+ARG NB_GID="100"
+WORKDIR /home/$NOTROOTUSER
+ENV GOPATH=/home/$NOTROOTUSER/go
+
+# Tools used by the scripts; the apt lists are removed in the same layer so
+# they do not end up in the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    'ca-certificates' \
+    'curl' \
+    'git' \
+    'jq' && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install kubectl
+ARG KUBECTL_VERSION=v1.23.0
+ARG KUBECTL_URL=https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl
+ARG KUBECTL_SHA=2d0f5ba6faa787878b642c151ccb2c3390ce4c1e6c8e2b59568b3869ba407c4f
+
+# -f makes curl fail on an HTTP error instead of saving the error page
+RUN curl -fLO "${KUBECTL_URL}" \
+    && echo "${KUBECTL_SHA}  kubectl" | sha256sum -c - \
+    && chmod +x ./kubectl \
+    && mv ./kubectl /usr/local/bin/kubectl
+
+# Install Azure CLI
+# Done before the scripts are copied so that editing a script does not
+# invalidate this (slow) layer. NOTE: the installer is unpinned and unverified.
+RUN curl -fsSL https://aka.ms/InstallAzureCLIDeb | bash \
+    && rm -rf /var/lib/apt/lists/*
+
+#ENV DEBIAN_FRONTEND=noninteractive
+#RUN apt-get update && \
+#    apt-get install -y --no-install-recommends \
+#    'expect' \
+#    'nano'
+
+# Operate in non root
+# Create the user with the UID/GID advertised in the ENV below; the group is
+# only created when no group with that GID exists yet.
+RUN (getent group "${NB_GID}" > /dev/null || groupadd --gid "${NB_GID}" "${NOTROOTUSER}") \
+    && useradd --no-log-init --uid "${NB_UID}" --gid "${NB_GID}" "${NOTROOTUSER}"
+ENV NOTROOTUSER="${NOTROOTUSER}" \
+    NB_UID=${NB_UID} \
+    NB_GID=${NB_GID}
+
+#Install GO, this, crane, and expect are likely uneeded at the moment
+#ARG GO_VERSION=1.16.7
+#ARG GO_SHA=7fe7a73f55ba3e2285da36f8b085e5c0159e9564ef5f63ee0ed6b818ade8ef04
+#ARG CRANE_VERSION=v0.8.0
+
+#RUN curl -OL https://golang.org/dl/go$GO_VERSION.linux-amd64.tar.gz \
+#    && echo "${GO_SHA} go$GO_VERSION.linux-amd64.tar.gz" | sha256sum -c - \
+#    && tar -C /usr/local -xvf go$GO_VERSION.linux-amd64.tar.gz
+#ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin
+
+#Install Crane
+#RUN go install github.com/google/go-containerregistry/cmd/crane@$CRANE_VERSION
+
+#Copy over scripts
+COPY scripts /home/$NOTROOTUSER
+COPY scripts-input /home/$NOTROOTUSER
+# The scripts must be executable and the directory must belong to the job user,
+# because the scripts write their intermediate .txt files into this working directory.
+RUN chmod 755 /home/$NOTROOTUSER /home/$NOTROOTUSER/*.sh \
+    && chown -R $NOTROOTUSER /home/$NOTROOTUSER
+# COPY --chmod=0755 scripts /home/$NOTROOTUSER # this needs buildkit
+
+USER $NOTROOTUSER
+
+# docker run -it --rm --entrypoint /bin/bash test
+CMD ["./0-the-co-ordinate.sh"]
diff --git a/notebook-restart/README.md b/notebook-restart/README.md
new file mode 100644
index 0000000..fc546f5
--- /dev/null
+++ b/notebook-restart/README.md
@@ -0,0 +1,40 @@
+# patch-notebook-sts
+https://github.com/StatCan/daaas/issues/957
+
+### IMPORTANT MAKE SURE THAT THE INPUT FILES HAVE THE LF ENDING AND NOT CRLF ENDING PLEASE
+
+This will be in a cronjob
+
+Maybe give the whole thing an argument say we can specify, "look for these tags" or something.
+^ might not be necessary on the assumption that everyone is on a v1 tag or some other long lived tag.
+^ that or if they for whatever reason are not on a long lived tag then in that case we go and update to one.
+
+## Pre-requisites
+1) We need all our aaw-kubeflow-container images being used to be updated to use the long-lived tag `v1`
+2) We need the `imagePullPolicy` to be changed to `Always` for all notebook pods.
+
+
+## Requirements
+Must be run with `execute` in the arguments for it to not do a dry run
+
+## General Flow of it all
+### Step 1 Get the list of notebook statefulsets and their digest
+Using `kubectl` get a list of notebook statefulsets and their digests. This will be a file
+with each line being valid json for easy comprehension.
The line will look like this for example
+
+```{"stsName":"othernotebook","image":{"containerID":"containerd://8324ffc23ab255fc1afbff9c7b0005c2ec024c5dcbeefa2b5a5f46807e1915f9","image":"k8scc01covidacr.azurecr.io/rstudio:c5b7982c","imageID":"k8scc01covidacr.azurecr.io/rstudio@sha256:b4465e2f5a92ee0505d01401516d9ba6e8084801700177f4ecad62be8fe23d9a","lastState":{},"name":"othernotebook","ready":true,"restartCount":0,"started":true,"state":{"running":{"startedAt":"2022-06-16T11:24:04Z"}}}}```
+
+### Step 2 Retrieve the digest of the "latest" long-lived tag of the AAW-provisioned images
+*IMPORTANT* This must be called with an argument of the long-lived tag. For example with `v1`
+In this current case it will be `v1`. We will use either the AZ cli or the JFROG Rest API to retrieve this.
+The `v1` tag itself is long-lived and gets overwritten with each push to the aaw-kubeflow-containers `master` branch.
+This `digest` which corresponds to the `imageID` from above is how we will know if a user's `statefulset` has to be restarted. If it does not match then it has to be restarted.
+
+
+### Step 3 Compare the outputs of Step 1 and Step 2
+We reduce the list `1-simple-output.txt` first by using the `2-images-with-tags.txt` to only _keep_ the images from `1-simple-output.txt` that contain that image and the tag. This is helpful for when we have multiple tags
+say `v1` and `v2`, both of which we want to update.
+
+
+### Step 4 Execute rolling restarts of the statefulsets from Step 3
+Go line by line and kubectl restart them
diff --git a/notebook-restart/scripts-input/0-list-of-tags.txt b/notebook-restart/scripts-input/0-list-of-tags.txt
new file mode 100644
index 0000000..626799f
--- /dev/null
+++ b/notebook-restart/scripts-input/0-list-of-tags.txt
@@ -0,0 +1 @@
+v1
diff --git a/notebook-restart/scripts-input/1-aaw-images.txt b/notebook-restart/scripts-input/1-aaw-images.txt
new file mode 100644
index 0000000..38f7f85
--- /dev/null
+++ b/notebook-restart/scripts-input/1-aaw-images.txt
@@ -0,0 +1,5 @@
+k8scc01covidacr.azurecr.io/rstudio
+k8scc01covidacr.azurecr.io/jupyterlab-cpu
+k8scc01covidacr.azurecr.io/jupyterlab-tensorflow
+k8scc01covidacr.azurecr.io/jupyterlab-pytorch
+k8scc01covidacr.azurecr.io/remote-desktop
diff --git a/notebook-restart/scripts-input/README.md b/notebook-restart/scripts-input/README.md
new file mode 100644
index 0000000..8b593c8
--- /dev/null
+++ b/notebook-restart/scripts-input/README.md
@@ -0,0 +1,39 @@
+This folder contains all the necessary scripts, as well as any necessary input text files for this to run.
+
+
+## Required Input .txt files
+### Note that a `newline` is _required_ at the end of each text file.
+They must also be `LF` files as `CRLF` has ruined me in the past
+
+### 0-list-of-tags.txt
+A file containing the tags we want to check for
+They must be newline separated and there _must be a newline at the end of the file_
+The co-ordinate will use this and loop through this.
+Example, if the file contents are
+```
+v1
+v2
+
+```
+Then the scripts will be run for both `v1` and `v2` tags, meaning that both `v1` and `v2` tags will be checked for and if necessary restarted.
+
+### 1-aaw-images.txt
+This is a file containing images we want to grep for and keep.
+They must be newline separated and there _must be a newline at the end of the file_
+This is used in combination with `0-list-of-tags.txt` to generate the full name of the registry + repository + tag.
+Example File
+```
+k8scc01covidacr.azurecr.io/rstudio
+k8scc01covidacr.azurecr.io/jupyterlab-cpu
+k8scc01covidacr.azurecr.io/jupyterlab-tensorflow
+k8scc01covidacr.azurecr.io/jupyterlab-pytorch
+k8scc01covidacr.azurecr.io/remote-desktop
+
+```
+
+## Generated Output .txt files
+
+### Note
+Some of these .txt files (notably the ones that are `>>`'d ) are `rm`'d at the beginning of each script.
+This is to prevent when we have multiple tags in `0-list-of-tags.txt` from whacky funky overlap.
+Naturally the `>` files are overwritten so those do not need to be `rm`'d
diff --git a/notebook-restart/scripts/0-the-co-ordinate.sh b/notebook-restart/scripts/0-the-co-ordinate.sh
new file mode 100755
index 0000000..dfde69d
--- /dev/null
+++ b/notebook-restart/scripts/0-the-co-ordinate.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+## This co-ordinates everything.
+# Usage: ./0-the-co-ordinate.sh <dry-run|execute>
+# To check for multiple tags (say v1 and v2) the entire thing is put in a loop
+# over 0-list-of-tags.txt (each numbered script cleans up its own files when it starts).
+
+# The numbered scripts and the .txt files are addressed relative to the current
+# directory, so always work from the directory this script lives in.
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+# This is a preventative measure against myself personally
+if [ -z "${1:-}" ]; then
+    echo "Must specify what you want to run with, either dry-run or execute"
+    echo "Nothing will happen kubectl run-wise without it"
+    exit 0
+fi
+
+echo "Starting..."
+echo "Taking the input file 0-list-of-tags.txt and using that to sort through"
+if ! readarray -t tagsToRead < 0-list-of-tags.txt; then
+    echo "Could not read 0-list-of-tags.txt" >&2
+    exit 1
+fi
+
+failures=0
+for i in "${tagsToRead[@]}"
+do
+    # Tolerate CRLF endings and blank lines instead of passing them on as a tag
+    i=${i%$'\r'}
+    [ -n "$i" ] || continue
+    # Restart only when every preparation step succeeded; after a failed step the
+    # restart list may be stale or incomplete.
+    if ./1-get-notebooks.sh && ./2-get-latest-manifest.sh "$i" && ./3-compare-live-to-recent.sh; then
+        ./4-rolling-restart.sh "$1" || failures=$((failures + 1))
+    else
+        echo "A preparation step failed for tag $i, skipping its restarts" >&2
+        failures=$((failures + 1))
+    fi
+done
+
+echo "Ending..."
+# Report failure to whatever scheduled this run
+[ "$failures" -eq 0 ] || exit 1
diff --git a/notebook-restart/scripts/1-get-notebooks.sh b/notebook-restart/scripts/1-get-notebooks.sh
new file mode 100644
index 0000000..a224ce1
--- /dev/null
+++ b/notebook-restart/scripts/1-get-notebooks.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+###############################################
+# Goal: Obtain a consumable list of entries that we can use to compare against the
+# most recent "v1" tag with information necessary to restart the statefulset as well
+#####
+# Actions: kubectl to obtain a json valid line (by line) of images. The file in its entirety is not valid json each line
+# is valid json.
+# Required Files: 1-aaw-images.txt, full image name of registry and repository.
+# This must be line separated and must contain a newline at the end
+## EXAMPLE
+#k8scc01covidacr.azurecr.io/rstudio
+#k8scc01covidacr.azurecr.io/jupyterlab-cpu
+## END EXAMPLE
+# Notes: You cannot simply use .status.containerStatuses[1] because it's not guaranteed
+# for the [1]th element to be the notebook.
+###############################################
+
+# A failing kubectl must not be masked by the jq it is piped into
+set -o pipefail
+
+## CLEANUP (for when iterating through a list of tags)
+# Truncate instead of rm: no error on a first run, and the files always exist
+# (possibly empty) for the later scripts, even when this script stops early.
+: > 1-simple-output.txt
+: > 1-pared-down-list.txt
+
+# Get a list of pods that have the label 'notebook-name' across all namespaces.
+
+# .items[] | {stsName: (.metadata.ownerReferences[].name), image: (.status.containerStatuses[])}
+# ^will give 3 entries per thing but it almost seems unavoidable.
+# I don't like the option of `.status.containerStatuses` because it is harder to remove what I don't need
+if ! kubectl get pods -l 'notebook-name' -A -o json |
+    jq -c '.items[] | select(.status.containerStatuses != null) | {stsName: (.metadata.ownerReferences[].name), namespace: (.metadata.namespace), image: (.status.containerStatuses[])}' > 1-full-list.txt; then
+    echo "Could not list the notebook pods, 1-simple-output.txt is left empty" >&2
+    exit 1
+fi
+
+# Using newline separated list of images, build one fixed-string pattern per image:
+#   "image":"<registry>/<repository>:
+# These images will ideally have long-lived tags
+# Matching literally (grep -F) keeps the dots of the registry name from acting as regex
+# wildcards, every image (also the last one in the file) must be followed by a tag, and
+# blank lines or CRLF endings in 1-aaw-images.txt cannot become a match-everything pattern.
+tr -d '\r' < 1-aaw-images.txt |
+    awk 'NF { print "\"image\":\"" $1 ":" }' |
+    grep -F -f - 1-full-list.txt > 1-pared-down-list.txt
+
+### Now we want to format this output into something easier to understand.
+# {stsName: .stsName, namespace: .namespace, image: .image.image, imageSHA: .image.imageID}
+# jq consumes a stream of JSON values, so the one-object-per-line file can be fed to it
+# directly; no shell loop with an unquoted echo of every line is needed.
+jq -c '{stsName: .stsName, namespace: .namespace, image: .image.image, imageSHA: .image.imageID}' 1-pared-down-list.txt > 1-simple-output.txt
diff --git a/notebook-restart/scripts/2-get-latest-manifest.sh b/notebook-restart/scripts/2-get-latest-manifest.sh
new file mode 100644
index 0000000..905e8ee
--- /dev/null
+++ b/notebook-restart/scripts/2-get-latest-manifest.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+###############################################
+# Goal: Obtain the
+# "imageID":"k8scc01covidacr.azurecr.io/rstudio@sha256:b4465e2f5a92ee0505d01401516d9ba6e8084801700177f4ecad62be8fe23d9a"
+# of the current / most recently pushed tag (in our case v1).
+#####
+# Actions: Pull the latest image digest from the acr
+###############################################
+
+# Cleanup files (-f: nothing to remove on a first run). The step 3 outputs are removed as well,
+# so that a restart list left over from a previous tag cannot be acted on if this lookup fails.
+rm -f 2-registry-images-with-shas.txt 2-shas-to-remove.txt 2-images-with-tags.txt
+rm -f 3-reduce-to-wanted-image-tags.txt 3-statefulsets-to-restart.txt
+
+# The tag to look up is the first argument, v1 is assumed when there is none
+[ -n "$1" ] || echo "No arguments provided will assume v1"
+tag=${1:-v1}
+
+# Retrieve just the digest, as we can get images that share the "sha" meaning they do not need to be restarted.
+# IMPORTANT: az wants the registry name and the image without the registry, i.e.
+# az acr repository show -n k8scc01covidacr --image jupyterlab-cpu:v1 works but using
+# k8scc01covidacr.azurecr.io/jupyterlab-cpu:v1 for --image will not.
+# An image:tag is only written to 2-images-with-tags.txt once its digest is known: without one we
+# cannot tell current from outdated, so that image is reported and left out instead of restarted.
+failed=0
+while IFS= read -r image; do
+    imageSha=$(az acr repository show -n "${image%%.*}" --image "${image#*/}:${tag}" --username "$ACR_READ_METADATA_USERNAME" --password "$ACR_READ_METADATA_PASSWORD" < /dev/null | jq -r '.digest // empty')
+    if [[ $imageSha == sha256:* ]]; then
+        echo "${image}:${tag}" >> 2-images-with-tags.txt
+        echo "$imageSha" >> 2-shas-to-remove.txt
+    else
+        echo "No digest found for ${image}:${tag}, skipping it" >&2
+        failed=1
+    fi
+done < <(tr -d '\r' < 1-aaw-images.txt | awk 'NF { print $1 }') # no CR, no blank lines
+[ "$failed" -eq 0 ] || exit 1
+
+# Other implementation Notes / possibilities (Using Artifactory)
+# In Artifactory Digest:sha256:95e1b3d264e67b417e6a2f3c6b64bb8a2b55d8e495fca1fd97a312135e2af6fa
+# Will need crane for this I think https://github.com/google/go-containerregistry/blob/main/cmd/crane/doc/crane.md
+# that or I use https://www.jfrog.com/confluence/display/JFROG/Artifactory+REST+API
+
+# IF USING ARTIFACTORY
+# We can use the `1-aaw-images.txt` and append a `:v1` to the end to check it.
+# If we truly want to be safe, we can _pull_ before we run queries to get it.
+# But with the automated scanning running each night, it should do that anyways.
+# Use JFrog (will need to do a pull to confirm it's the most recent version of `v1`)
+# ^ that may be an intermediary 2-A step, keep this as just querying Artifactory for the sha
\ No newline at end of file
diff --git a/notebook-restart/scripts/3-compare-live-to-recent.sh b/notebook-restart/scripts/3-compare-live-to-recent.sh
new file mode 100644
index 0000000..d28c5c9
--- /dev/null
+++ b/notebook-restart/scripts/3-compare-live-to-recent.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+###############################################
+# Use what is in 2-images-with-tags.txt and 2-shas-to-remove.txt and narrow down the
+# 1-simple-output.txt to only contain entries which
+# A) The repository name (ex jupyterlab-cpu) matches the image name and
+# B) Do not MATCH what we have from 2-shas-to-remove.txt in the sha category.
+
+#Cleanup make sure everything is clean
+# Both outputs are truncated rather than rm'd: they then always exist, and a restart list
+# from a previous tag cannot survive when this run has nothing to write or stops early.
+: > 3-reduce-to-wanted-image-tags.txt
+: > 3-statefulsets-to-restart.txt
+
+# Now the way we match on the imageTag itself may change depending on if we use artifactory or the acr
+
+# Will have a separate job to migrate all workspaces to the long lived tag.
+# So we do a full match on the image tag, and then a negative match on the sha's that do not match.
+
+# Step 1 List 1
+# Reduce the list (1-simple-output.txt) by using what we have in 2-images-with-tags
+# Given the example of calling script `./2-xyz.sh v1` where we specify the tag of the image we want
+# We reduce the 1-simple-output.txt to a text file that only has those in 2-images-with-tags
+# The complete "image":"<name>:<tag>" field is matched as a fixed string, so that v1 does not
+# also select v10 and the dots in the registry name are not treated as regex wildcards.
+while IFS= read -r line || [ -n "$line" ]; do
+    line=${line%$'\r'}
+    [ -n "$line" ] || continue
+    grep -F -- "\"image\":\"${line}\"" 1-simple-output.txt >> 3-reduce-to-wanted-image-tags.txt
+done < 2-images-with-tags.txt
+
+# Step 2 List 2
+# Reduce the list (3-reduce-to-wanted-image-tags.txt) by removing any sts that contain the
+# EXACT sha256:... of a line in 2-shas-to-remove.txt. The reduced list itself is kept for testing.
+# 2-shas-to-remove is a list of shas that match up to the current long lived tag (ex v1)
+# And thus we do not want to restart these so we take them off the list.
+#
+# Every image needs a well-formed digest first. When one is missing (the registry lookup failed)
+# the statefulsets of that image cannot be recognised as current and would all be restarted,
+# so stop here and leave the restart list empty.
+imageCount=$(grep -c '[^[:space:]]' 2-images-with-tags.txt 2>/dev/null)
+shaCount=$(grep -c -E '^[[:space:]]*sha256:[0-9a-f]{64}[[:space:]]*$' 2-shas-to-remove.txt 2>/dev/null)
+if [ "${shaCount:-0}" -lt "${imageCount:-0}" ]; then
+    echo "Only ${shaCount:-0} valid digests for ${imageCount:-0} images, not restarting anything" >&2
+    exit 1
+fi
+
+# grep exits with 1 when no line is left to restart, which is a normal outcome and not an error
+grep -E -o 'sha256:[0-9a-f]{64}' 2-shas-to-remove.txt |
+    grep -v -F -f - 3-reduce-to-wanted-image-tags.txt > 3-statefulsets-to-restart.txt || [ $? -eq 1 ]
+
+# Gist for for loop test https://gist.github.com/Jose-Matsuda/e0b339d115ad25b36b8b9f455dbacf02
diff --git a/notebook-restart/scripts/4-rolling-restart.sh b/notebook-restart/scripts/4-rolling-restart.sh
new file mode 100755
index 0000000..c0bfcbf
--- /dev/null
+++ b/notebook-restart/scripts/4-rolling-restart.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+###############################################
+# Goal: Perform a rolling restart using the entries in 3-statefulsets-to-restart.txt
+# The line should look something like (a one liner)
+# {"stsName":"othernotebook", "namespace": "jose-matsuda", "imageSHA":
+# "k8scc01covidacr.azurecr.io/rstudio@sha256:b4465e2f5a92ee0505d01401516d9ba6e8084801700177f4ecad62be8fe23d9a"}
+# It is also at this point which we could probably implement any "exclusions"
+# on any namespaces or statefulsets, but do not do that now.
+#kubectl rollout restart statefulset/$line -n blah
+
+# Shared loop behind dryrun and execute. $1 is the mode: "execute" runs the restarts,
+# anything else only prints the commands. The namespace/statefulset pairs are read with
+# jq -r (raw, no surrounding quotes) and de-duplicated in case a statefulset is listed more than once.
+# This is a job so the sleep does not matter as much: a 10 second pause after every 20 restarts.
+function restartStatefulsets() {
+    local mode=$1 loopCounter=0 namespace statefulsetname
+    if [ ! -f 3-statefulsets-to-restart.txt ]; then
+        echo "3-statefulsets-to-restart.txt does not exist, nothing to restart" >&2
+        return 1
+    fi
+    while IFS=$'\t' read -r namespace statefulsetname; do
+        if [ "$loopCounter" -eq 20 ]; then
+            echo "Triggered 20 restarts sleeping for 10 seconds"
+            sleep 10
+            loopCounter=0
+        fi
+        if [ "$mode" == "execute" ]; then
+            # stdin is detached so kubectl can never consume the rest of the list
+            kubectl rollout restart "statefulset/$statefulsetname" -n "$namespace" < /dev/null
+        else
+            echo "kubectl rollout restart statefulset/$statefulsetname -n $namespace"
+        fi
+        loopCounter=$((loopCounter + 1))
+    done < <(jq -r '[.namespace, .stsName] | @tsv' 3-statefulsets-to-restart.txt | sort -u)
+}
+
+# Print the restart commands without touching the cluster
+function dryrun() { restartStatefulsets dryrun; }
+
+# Perform the rolling restarts
+function execute() { restartStatefulsets execute; }
+
+# Only the literal argument "execute" restarts anything; all else (even no argument) is a dry run
+if [ "${1:-}" == "execute" ]; then
+    execute
+else
+    dryrun
+fi
diff --git a/notebook-restart/scripts/README.md b/notebook-restart/scripts/README.md
new file mode 100644
index 0000000..c6fabc6
--- /dev/null
+++ b/notebook-restart/scripts/README.md
@@ -0,0 +1,21 @@
+## General Flow of it all
+### Step 1 Get the list of notebook statefulsets and their digest
+Using `kubectl` get a list of notebook statefulsets and their digests. This will be a file
+with each line being valid json for easy comprehension.
The line will look like this for example
+
+```{"stsName":"othernotebook","image":{"containerID":"containerd://8324ffc23ab255fc1afbff9c7b0005c2ec024c5dcbeefa2b5a5f46807e1915f9","image":"k8scc01covidacr.azurecr.io/rstudio:c5b7982c","imageID":"k8scc01covidacr.azurecr.io/rstudio@sha256:b4465e2f5a92ee0505d01401516d9ba6e8084801700177f4ecad62be8fe23d9a","lastState":{},"name":"othernotebook","ready":true,"restartCount":0,"started":true,"state":{"running":{"startedAt":"2022-06-16T11:24:04Z"}}}}```
+
+### Step 2 Retrieve the digest of the "latest" long-lived tag of the AAW-provisioned images
+*IMPORTANT* This must be called with an argument of the long-lived tag. For example with `v1`
+In this current case it will be `v1`. We will use either the AZ cli or the JFROG Rest API to retrieve this.
+The `v1` tag itself is long-lived and gets overwritten with each push to the aaw-kubeflow-containers `master` branch.
+This `digest` which corresponds to the `imageID` from above is how we will know if a user's `statefulset` has to be restarted. If it does not match then it has to be restarted.
+
+
+### Step 3 Compare the outputs of Step 1 and Step 2
+We reduce the list `1-simple-output.txt` first by using the `2-images-with-tags.txt` to only _keep_ the images from `1-simple-output.txt` that contain that image and the tag. This is helpful for when we have multiple tags
+say `v1` and `v2`, both of which we want to update.
+
+
+### Step 4 Execute rolling restarts of the statefulsets from Step 3
+Go line by line and kubectl restart them