nigel.stanger / docker-analytics

Added graphframes for PySpark

Branches: master, spark3
Commit: d15f6e57af753f287ddd4851a2bc03263e62c1b9 (1 parent: f2d35c3)
Nigel Stanger authored on 1 Oct 2020

Showing 2 changed files: docker-compose.yml, pyspark/Dockerfile
docker-compose.yml
version: "3.3"
services:
  spark-master:
    image: "nstanger/spark:latest"
    container_name: spark-master
    hostname: spark-master
    ports:
      - "8080:8080"
      - "7077:7077"
    networks:
      - spark-network
    environment:
      - "SPARK_HOSTNAME=spark-master"
      - "SPARK_MASTER=spark://spark-master:7077"
      - "SPARK_WORKER_MEMORY=2g"
    command: "start-master.sh"
    volumes:
      - ${HOME}/tmp/sparkdata:/mnt/sparkdata
  spark-worker:
    image: nstanger/spark:latest
    depends_on:
      - spark-master
    ports:
      - 8080
    networks:
      - spark-network
    environment:
      - "SPARK_MASTER=spark://spark-master:7077"
      - "SPARK_WORKER_WEBUI_PORT=8080"
      - "SPARKMONITOR_UI_PORT=8080"
      - "SPARK_WORKER_MEMORY=2g"
    command: "start-worker.sh"
    volumes:
      - ${HOME}/tmp/sparkdata:/mnt/sparkdata
  pyspark:
    image: nstanger/pyspark:latest
    depends_on:
      - spark-master
    ports:
      - "8888:8888"
    networks:
      - spark-network
    environment:
      - "SPARK_MASTER=spark://spark-master:7077"
      - "SPARK_MASTER_WEBUI_PORT=8080"
-     - "PYSPARK_SUBMIT_ARGS=--master spark://spark-master:7077 --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 pyspark-shell"
+     - "PYSPARK_SUBMIT_ARGS=--master spark://spark-master:7077 --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3,graphframes:graphframes:0.8.1-spark2.4-s_2.11 pyspark-shell"
    working_dir: /home/pyspark/work
    volumes:
      - ${HOME}/tmp/sparkdata:/mnt/sparkdata
  zookeeper:
    image: "bitnami/zookeeper:3"
    hostname: zookeeper
    ports:
      - "2181:2181"
    networks:
      - spark-network
    volumes:
      - "zookeeper_data:/bitnami"
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
  kafka:
    image: "bitnami/kafka:2"
    hostname: kafka
    ports:
      - "9092:9092"
    networks:
      - spark-network
    environment:
      - "ALLOW_PLAINTEXT_LISTENER=yes"
      - "KAFKA_LISTENERS=PLAINTEXT://0.0.0.0:9092"
      - "KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9092"
      - "KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181"
    volumes:
      - ${HOME}/tmp/sparkdata:/mnt/sparkdata
      - "kafka_data:/bitnami"
    depends_on:
      - zookeeper
volumes:
  zookeeper_data:
    driver: local
  kafka_data:
    driver: local
networks:
  spark-network:
    driver: bridge
    ipam:
      driver: default
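The new graphframes coordinate in PYSPARK_SUBMIT_ARGS is fetched by spark-submit when the PySpark shell starts; its suffix (0.8.1-spark2.4-s_2.11) matches the Spark 2.4/Scala 2.11 build that the Kafka package (spark-sql-kafka-0-10_2.11:2.4.3) already pins. A minimal smoke test of the result, run from a notebook in the pyspark container -- the vertex and edge data below are made up for illustration, and in the notebook kernel a spark session already exists courtesy of PYTHONSTARTUP:

# Minimal GraphFrames smoke test (illustrative data, not from the repo).
from pyspark.sql import SparkSession
from graphframes import GraphFrame

# In the notebook kernel this session already exists; getOrCreate() just
# picks it up rather than starting a new one.
spark = SparkSession.builder.appName("graphframes-smoke-test").getOrCreate()

# GraphFrames expects an "id" column on vertices and "src"/"dst" on edges.
vertices = spark.createDataFrame(
    [("a", "Alice"), ("b", "Bob"), ("c", "Carol")], ["id", "name"])
edges = spark.createDataFrame(
    [("a", "b"), ("b", "c"), ("c", "a")], ["src", "dst"])

g = GraphFrame(vertices, edges)
g.inDegrees.show()                                      # one row per vertex
g.pageRank(resetProbability=0.15, maxIter=5).vertices.show()

If the --packages coordinate is missing or mismatched, the import succeeds but constructing the GraphFrame fails with a Java class-not-found error, which is the usual symptom of the Python wrapper and the JVM package being out of step.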
pyspark/Dockerfile
# Don't use the official jupyter/pyspark-notebook image because it's massive!
FROM nstanger/spark

# Ideally we should run as a non-root user, but it's problematic to set
# up shared files, especially if running Docker within, say, VirtualBox.
# See https://vsupalov.com/docker-shared-permissions/
# ARG NB_USER="pyspark"
# ARG NB_UID="1000"
# ARG NB_GID="1000"

USER root
# RUN adduser -D -G root $NB_USER

RUN apk add --no-cache --virtual .build-deps \
        build-base \
        zeromq-dev \
        python3-dev \
        libffi-dev \
        py3-zmq

RUN pip install --upgrade \
        pip \
    && pip install \
        # Tornado 6 breaks sparkmonitor
        tornado==5.1 \
        jupyter \
        kafka-python \
        pyspark \
        sparkmonitor \
+       graphframes \
        tini

RUN apk del .build-deps

ENV PYSPARK_PYTHON="/usr/local/bin/python3"
ENV KERNEL_NAME="PySpark" \
    KERNELS_TEMPLATE_PATH="/tmp" \
    KERNELS_DIR_PATH="/home/$NB_USER/.local/share/jupyter/kernels" \
    PYTHONPATH="$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip" \
    PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON" \
    PYSPARK_SUBMIT_ARGS="--master $SPARK_MASTER pyspark-shell" \
    PYTHONSTARTUP="$SPARK_HOME/python/pyspark/shell.py"
# ENV SPARKMONITOR_UI_PORT 8080

# USER $NB_USER

RUN ipython kernel install --name $KERNEL_NAME

# kernel.json is based on the template from <https://github.com/Anchormen/pyspark-jupyter-kernels>.
COPY kernel.json pyspark-kernel.sh $KERNELS_DIR_PATH/$KERNEL_NAME/

RUN jupyter nbextension install sparkmonitor --py --symlink \
    && jupyter nbextension enable sparkmonitor --py \
    && jupyter serverextension enable --py sparkmonitor \
    && ipython profile create \
    && echo "c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')" >> $(ipython profile locate default)/ipython_kernel_config.py

# RUN mkdir -p /home/$NB_USER/work
# WORKDIR /home/$NB_USER/work

ENTRYPOINT ["/sbin/tini", "--"]

# pyspark-kernel.sh will automatically set up the PySpark context when
# the kernel is loaded.
CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--allow-root"]

# debugging
# CMD ["bash"]
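Note that pip installs only the Python wrapper for graphframes; the JVM half still arrives via the --packages coordinate in docker-compose.yml above, so the two must stay in step. The image also bundles kafka-python, which talks to the broker defined in the compose file independently of Spark. A hedged sketch of exercising that path -- the topic name is hypothetical, and the broker address mirrors the KAFKA_ADVERTISED_LISTENERS setting (localhost:9092), so adjust it to wherever the client actually runs:

# Illustrative kafka-python producer (topic name is made up).
from kafka import KafkaProducer

# The compose file advertises the broker at localhost:9092; a client in
# another container may need a different address.
producer = KafkaProducer(bootstrap_servers="localhost:9092")
producer.send("test-topic", b"hello from the pyspark container")
producer.flush()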