From d59cac42a4782584158ffc75a8f974c5a5740ed5 Mon Sep 17 00:00:00 2001
From: Andrew Scribner
Date: Thu, 17 Sep 2020 16:54:14 -0400
Subject: [PATCH 1/3] feat: add extension to minimal-notebook for web scraping

---
 yoon-minimal-web-scraping/Dockerfile | 42 ++++++++++++++++++++++++++++
 yoon-minimal-web-scraping/README.md  | 16 +++++++++++
 yoon-minimal-web-scraping/build.sh   | 10 +++++++
 3 files changed, 68 insertions(+)
 create mode 100644 yoon-minimal-web-scraping/Dockerfile
 create mode 100644 yoon-minimal-web-scraping/README.md
 create mode 100755 yoon-minimal-web-scraping/build.sh

diff --git a/yoon-minimal-web-scraping/Dockerfile b/yoon-minimal-web-scraping/Dockerfile
new file mode 100644
index 0000000..c20290b
--- /dev/null
+++ b/yoon-minimal-web-scraping/Dockerfile
@@ -0,0 +1,42 @@
+# ex: FROM k8scc01covidacr.azurecr.io/minimal-notebook-cpu:5ef877ea13789f64594c219ef0a302dc97c21bb4
+ARG BASE_CONTAINER
+FROM $BASE_CONTAINER
+USER root
+
+RUN apt-get update && apt-get install -y software-properties-common --no-install-recommends \
+    && apt-get install -y chromium-browser chromium-browser-l10n chromium-codecs-ffmpeg \
+    && ln -s /usr/bin/chromium-browser /usr/bin/google-chrome \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN wget -q https://chromedriver.storage.googleapis.com/85.0.4183.87/chromedriver_linux64.zip && \
+    unzip chromedriver_linux64.zip && \
+    rm chromedriver_linux64.zip && \
+    chmod a+x chromedriver && \
+    mv chromedriver /usr/bin/ && \
+    wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
+    echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list
+
+RUN apt-get update && \
+    apt-get -y install google-chrome-stable && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir 'selenium==3.141.0' && \
+    fix-permissions $CONDA_DIR && \
+    fix-permissions /home/$NB_USER
+
+# Configure container startup
+EXPOSE 8888
+USER jovyan
+ENTRYPOINT ["tini", "--"]
+CMD ["start-custom.sh"]
+
+# To test in python:
+# from selenium import webdriver
+# from selenium.webdriver.chrome.options import Options
+
+# chrome_options = Options()
+# chrome_options.add_argument('--headless')
+# chrome_options.add_argument('--no-sandbox')
+# d = webdriver.Chrome(options=chrome_options)
+# d.get("https://www.google.com")
diff --git a/yoon-minimal-web-scraping/README.md b/yoon-minimal-web-scraping/README.md
new file mode 100644
index 0000000..f5c6579
--- /dev/null
+++ b/yoon-minimal-web-scraping/README.md
@@ -0,0 +1,16 @@
+# Summary
+
+Custom Jupyter server built with Chrome and selenium for web scraping. Extends a pinned version of the `minimal-notebook-cpu` image.
+
+# Build/Update Instructions
+
+(must have permission to push to k8scc01covidacr)
+
+```
+# Edit build.sh to set image VERSION
+# Edit build.sh to pin to the desired minimal-notebook-cpu image
+
+az acr login --name k8scc01covidacr
+
+./build.sh
+```
diff --git a/yoon-minimal-web-scraping/build.sh b/yoon-minimal-web-scraping/build.sh
new file mode 100755
index 0000000..667fe37
--- /dev/null
+++ b/yoon-minimal-web-scraping/build.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# VERSION can be anything, but must not reuse (overwrite) a previously pushed tag
+VERSION="YYYY-MM-DD_VERSIONNUMBER"
+IMAGE_TAG="k8scc01covidacr.azurecr.io/yoon-minimal-web-scraping:$VERSION"
+BASE_CONTAINER="k8scc01covidacr.azurecr.io/minimal-notebook-cpu:5ef877ea13789f64594c219ef0a302dc97c21bb4"
+docker build -t "$IMAGE_TAG" --build-arg BASE_CONTAINER="$BASE_CONTAINER" . || exit 1
+# docker run -p 8888:8888 $IMAGE_TAG
+
+# Must be logged into az acr (az acr login --name k8scc01covidacr)
+docker push "$IMAGE_TAG"

From 49af2afddef886c7a626c28890b6c34026c6c9e8 Mon Sep 17 00:00:00 2001
From: Andrew Scribner
Date: Thu, 17 Sep 2020 16:58:13 -0400
Subject: [PATCH 2/3] docs: add acr entry to readme.md for easy use later

---
 yoon-minimal-web-scraping/README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/yoon-minimal-web-scraping/README.md b/yoon-minimal-web-scraping/README.md
index f5c6579..790f9e8 100644
--- a/yoon-minimal-web-scraping/README.md
+++ b/yoon-minimal-web-scraping/README.md
@@ -2,6 +2,12 @@
 
 Custom Jupyter server built with Chrome and selenium for web scraping. Extends a pinned version of the `minimal-notebook-cpu` image.
 
+# Existing versions
+
+Paste these into the custom notebook image in the `New Server` page to use them
+
+* k8scc01covidacr.azurecr.io/yoon-minimal-web-scraping:2020-09-17_1
+
 # Build/Update Instructions
 
 (must have permission to push to k8scc01covidacr)
@@ -13,4 +19,5 @@ Custom Jupyter server built with Chrome and selenium for web scraping. Extends
 az acr login --name k8scc01covidacr
 
 ./build.sh
+# Add to Existing versions above if sharing with others
 ```

From ed1b9f99ddbec308579374f7f07bd1f8cf7afb79 Mon Sep 17 00:00:00 2001
From: oguolad
Date: Tue, 8 Mar 2022 18:36:56 +0000
Subject: [PATCH 3/3] new docker files

---
 dayo-web-scraping/Dockerfile | 40 ++++++++++++++++++++++++++++++++++++
 dayo-web-scraping/README.md  |  5 +++++
 2 files changed, 45 insertions(+)
 create mode 100644 dayo-web-scraping/Dockerfile
 create mode 100644 dayo-web-scraping/README.md

diff --git a/dayo-web-scraping/Dockerfile b/dayo-web-scraping/Dockerfile
new file mode 100644
index 0000000..69db755
--- /dev/null
+++ b/dayo-web-scraping/Dockerfile
@@ -0,0 +1,40 @@
+# ex: FROM k8scc01covidacr.azurecr.io/minimal-notebook-cpu:5ef877ea13789f64594c219ef0a302dc97c21bb4
+ARG BASE_CONTAINER
+FROM $BASE_CONTAINER
+USER root
+
+RUN apt-get update && apt-get install -y software-properties-common --no-install-recommends \
+    && apt-get install -y chromium-browser chromium-browser-l10n chromium-codecs-ffmpeg \
+    && ln -s /usr/bin/chromium-browser /usr/bin/google-chrome \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN wget -q https://chromedriver.storage.googleapis.com/85.0.4183.87/chromedriver_linux64.zip && \
+    unzip chromedriver_linux64.zip && \
+    rm chromedriver_linux64.zip && \
+    chmod a+x chromedriver && \
+    mv chromedriver /usr/bin/ && \
+    wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
+    echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list
+
+RUN apt-get update && \
+    apt-get -y install google-chrome-stable && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir 'selenium==3.141.0' && \
+    fix-permissions $CONDA_DIR && \
+    fix-permissions /home/$NB_USER
+
+RUN pip install --no-cache-dir --upgrade \
+    pip 'playwright==1.19.1' && \
+    fix-permissions $CONDA_DIR && \
+    fix-permissions /home/$NB_USER
+
+RUN apt-get update && \
+    playwright install-deps && rm -rf /var/lib/apt/lists/*
+
+# Configure container startup
+EXPOSE 8888
+USER jovyan
+ENTRYPOINT ["tini", "--"]
+CMD ["start-custom.sh"]
diff --git a/dayo-web-scraping/README.md b/dayo-web-scraping/README.md
new file mode 100644
index 0000000..6c8324a
--- /dev/null
+++ b/dayo-web-scraping/README.md
@@ -0,0 +1,5 @@
+# Summary
+
+Custom Jupyter server built with Chrome, playwright, and selenium for web scraping. Extends a pinned version of the `minimal-notebook-cpu` image.
+
+