Commit abfcb58
Merge pull request #66 from aws-samples/nmurich_docker_python_update
Summary

Refactored the Spark Lambda Dockerfile to use multi-stage builds for a smaller final container image, and added an Ubuntu variant with comprehensive documentation.

Key Changes

🚀 Performance & Optimization:
- Implemented a multi-stage Docker build, reducing the final image size
- Consolidated RUN commands to minimize Docker layers
- Added --no-cache-dir flags to pip installations
- Improved cleanup, removing temporary files and caches

⬆️ Runtime Modernization:
- Upgraded the Python runtime from 3.8 to 3.10
- Upgraded Java from OpenJDK 1.8 to Amazon Corretto 11
- Updated environment paths to reflect the Python 3.10 directory structure
- Removed the yum version lock on the old Java runtime so current versions are installed

🐧 Platform Extension:
- Added Dockerfile.ubuntu for Ubuntu 22.04 deployments
- Created a generic Spark runner with S3 integration
- Implemented non-root user execution for improved security
- Added comprehensive documentation in UBUNTU_DOCKERFILE_GUIDE.md

🛠️ Code Quality:
- Removed the commented-out legacy DEEQU installation block
- Improved the conditional framework installation logic
- Better error handling and logging in the build process
- Standardized environment variable organization

📋 Framework Support:
- Maintained compatibility with the Delta, Hudi, Iceberg, and Deequ frameworks
- Preserved all existing build arguments and configurations
- Enhanced the JAR download process with better error handling

Benefits

- Reduced image size through multi-stage builds
- Improved security from up-to-date runtimes and non-root execution
- Better maintainability with cleaner, more organized code
- Extended deployment options supporting both Lambda and Ubuntu environments
- Enhanced developer experience with comprehensive documentation

Breaking Changes

- The Python runtime path changed from /var/lang/lib/python3.8/ to /var/lang/lib/python3.10/
- The Java runtime upgrade may require application compatibility testing
2 parents 820fa92 + 9896ec6 commit abfcb58
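For example, a framework-specific image can be built by passing the build arguments declared at the top of the Dockerfile. This is a minimal sketch: the image tag is arbitrary, and DELTA is an illustrative FRAMEWORK value — the accepted values are determined by download_jars.sh, not by this commit message.

```sh
# Hedged example: build the Lambda image with the Delta framework.
# "DELTA" and the tag are placeholders; accepted FRAMEWORK values are
# defined by download_jars.sh.
docker build \
  --build-arg FRAMEWORK=DELTA \
  --build-arg DELTA_FRAMEWORK_VERSION=2.2.0 \
  -t spark-on-lambda:delta .
```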

3 files changed · +490 −70 lines

Dockerfile

Lines changed: 55 additions & 70 deletions
@@ -1,85 +1,70 @@
-# Use AWS Lambda Python 3.8 image as base
-FROM public.ecr.aws/lambda/python:3.8
+# Multi-stage build for optimal size
+FROM public.ecr.aws/lambda/python:3.10 as builder
 
-# Setting the compatible versions of libraries
+# Build arguments - consolidated at top
 ARG HADOOP_VERSION=3.2.4
 ARG AWS_SDK_VERSION=1.11.901
 ARG PYSPARK_VERSION=3.3.0
-
-#FRAMEWORK will passed during the Docker build. For Apache Iceberg in somecase downgrading PYSPARK_VERSION to 3.2.0 will be good
 ARG FRAMEWORK
 ARG DELTA_FRAMEWORK_VERSION=2.2.0
 ARG HUDI_FRAMEWORK_VERSION=0.12.2
-
-
 ARG ICEBERG_FRAMEWORK_VERSION=3.3_2.12
 ARG ICEBERG_FRAMEWORK_SUB_VERSION=1.0.0
-
-
 ARG DEEQU_FRAMEWORK_VERSION=2.0.3-spark-3.3
 
-
-# Perform system updates and install dependencies
-RUN yum update -y && \
-    yum -y update zlib && \
-    yum -y install wget && \
-    yum -y install yum-plugin-versionlock && \
-    yum -y versionlock add java-1.8.0-openjdk-1.8.0.362.b08-0.amzn2.0.1.x86_64 && \
-    yum -y install java-1.8.0-openjdk && \
-    yum -y install unzip && \
-    pip install --upgrade pip && \
-    pip install pyspark==$PYSPARK_VERSION boto3 && \
-    yum clean all
-
-# Install pydeequ if FRAMEWORK is DEEQU
-#RUN if [ "$FRAMEWORK" = "DEEQU" ] ; then \
-# pip install --no-deps pydeequ && \
-# pip install pandas && \
-# yum clean all; \
-# else \
-# echo FRAMEWORK is ${FRAMEWORK} ; \
-# fi
-
-RUN echo "$FRAMEWORK" | grep -q "DEEQU" && \
-    pip install --no-deps pydeequ && \
-    pip install pandas && \
-    yum clean all && \
-    echo "DEEQU found in FRAMEWORK" || \
-    echo "DEEQU not found in FRAMEWORK"
-
-
-# Set environment variables for PySpark
-ENV SPARK_HOME="/var/lang/lib/python3.8/site-packages/pyspark"
-ENV SPARK_VERSION=3.3.0
-ENV PATH=$PATH:$SPARK_HOME/bin
-ENV PATH=$PATH:$SPARK_HOME/sbin
-ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:/home/glue_functions:$PYTHONPATH
-ENV PATH=$SPARK_HOME/python:$PATH
-
-COPY download_jars.sh /tmp
+# Single consolidated RUN layer for all build operations
+COPY download_jars.sh /tmp/
+RUN set -ex && \
+    # System updates and package installation
+    yum update -y && \
+    yum install -y java-11-amazon-corretto-headless wget unzip && \
+    yum clean all && \
+    rm -rf /var/cache/yum && \
+    # Python package installation
+    pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir pyspark==$PYSPARK_VERSION boto3 && \
+    # Conditional DEEQU installation
+    (echo "$FRAMEWORK" | grep -q "DEEQU" && \
+    pip install --no-cache-dir --no-deps pydeequ && \
+    pip install --no-cache-dir pandas || \
+    echo "DEEQU not found in FRAMEWORK") && \
+    # JAR download and cleanup
+    chmod +x /tmp/download_jars.sh && \
+    SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" && \
+    /tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION && \
+    rm -rf /tmp/* /var/tmp/*
+
+# Final optimized stage
+FROM public.ecr.aws/lambda/python:3.10
+
+# Single consolidated RUN layer for runtime setup
+COPY --from=builder /var/lang/lib/python3.10/site-packages/ /var/lang/lib/python3.10/site-packages/
+COPY --from=builder /var/runtime/ /var/runtime/
 COPY libs/glue_functions /home/glue_functions
-RUN chmod -R 755 /home/glue_functions
-
-RUN chmod +x /tmp/download_jars.sh && \
-    /tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION
-
-ENV PATH=${PATH}:${JAVA_HOME}/bin
-
-# Setting up the ENV vars for local code, in AWS LAmbda you have to set Input_path and Output_path
-ENV INPUT_PATH=""
-ENV OUTPUT_PATH=""
-ENV AWS_ACCESS_KEY_ID=""
-ENV AWS_SECRET_ACCESS_KEY=""
-ENV AWS_REGION=""
-ENV AWS_SESSION_TOKEN=""
-ENV CUSTOM_SQL=""
-
-# spark-class file is setting the memory to 1 GB
-COPY spark-class $SPARK_HOME/bin/
-RUN chmod -R 755 $SPARK_HOME
-
-# Copy the Pyspark script to container
+COPY spark-class /var/lang/lib/python3.10/site-packages/pyspark/bin/
 COPY sparkLambdaHandler.py ${LAMBDA_TASK_ROOT}
 
-# calling the Lambda handler
+RUN set -ex && \
+    # Install runtime Java and cleanup
+    yum update -y && \
+    yum install -y java-11-amazon-corretto-headless && \
+    yum clean all && \
+    rm -rf /var/cache/yum /tmp/* /var/tmp/* && \
+    # Set permissions in single operation
+    chmod -R 755 /home/glue_functions /var/lang/lib/python3.10/site-packages/pyspark
+
+# Consolidated environment variables
+ENV SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" \
+    SPARK_VERSION=3.3.0 \
+    JAVA_HOME="/usr/lib/jvm/java-11-amazon-corretto" \
+    PATH="$PATH:/var/lang/lib/python3.10/site-packages/pyspark/bin:/var/lang/lib/python3.10/site-packages/pyspark/sbin:/usr/lib/jvm/java-11-amazon-corretto/bin" \
+    PYTHONPATH="/var/lang/lib/python3.10/site-packages/pyspark/python:/var/lang/lib/python3.10/site-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/home/glue_functions" \
+    INPUT_PATH="" \
+    OUTPUT_PATH="" \
+    AWS_ACCESS_KEY_ID="" \
+    AWS_SECRET_ACCESS_KEY="" \
+    AWS_REGION="" \
+    AWS_SESSION_TOKEN="" \
+    CUSTOM_SQL=""
+
 CMD [ "/var/task/sparkLambdaHandler.lambda_handler" ]

Dockerfile.ubuntu

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+# Multi-stage build for optimal size - Ubuntu version
+FROM ubuntu:22.04 as builder
+
+# Prevent interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Build arguments - consolidated at top
+ARG HADOOP_VERSION=3.2.4
+ARG AWS_SDK_VERSION=1.11.901
+ARG PYSPARK_VERSION=3.3.0
+ARG FRAMEWORK
+ARG DELTA_FRAMEWORK_VERSION=2.2.0
+ARG HUDI_FRAMEWORK_VERSION=0.12.2
+ARG ICEBERG_FRAMEWORK_VERSION=3.3_2.12
+ARG ICEBERG_FRAMEWORK_SUB_VERSION=1.0.0
+ARG DEEQU_FRAMEWORK_VERSION=2.0.3-spark-3.3
+
+# Single consolidated RUN layer for all build operations
+COPY download_jars.sh /tmp/
+RUN set -ex && \
+    # System updates and package installation
+    apt-get update && \
+    apt-get install -y \
+        python3.10 \
+        python3.10-dev \
+        python3.10-venv \
+        python3-pip \
+        openjdk-11-jre-headless \
+        wget \
+        unzip \
+        curl \
+        ca-certificates && \
+    # Create symbolic links for python
+    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
+    ln -sf /usr/bin/python3.10 /usr/bin/python && \
+    # Upgrade pip
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    # Python package installation
+    pip install --no-cache-dir pyspark==$PYSPARK_VERSION boto3 && \
+    # Conditional DEEQU installation
+    (echo "$FRAMEWORK" | grep -q "DEEQU" && \
+    pip install --no-cache-dir --no-deps pydeequ && \
+    pip install --no-cache-dir pandas || \
+    echo "DEEQU not found in FRAMEWORK") && \
+    # JAR download and cleanup
+    chmod +x /tmp/download_jars.sh && \
+    SPARK_HOME="/usr/local/lib/python3.10/dist-packages/pyspark" && \
+    /tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION && \
+    # Cleanup
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /root/.cache
+
+# Final optimized stage
+FROM ubuntu:22.04
+
+# Prevent interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Copy Python packages and runtime from builder
+COPY --from=builder /usr/local/lib/python3.10/dist-packages/ /usr/local/lib/python3.10/dist-packages/
+COPY --from=builder /usr/bin/python* /usr/bin/
+COPY --from=builder /usr/lib/python3.10/ /usr/lib/python3.10/
+COPY --from=builder /usr/lib/jvm/java-11-openjdk-amd64/ /usr/lib/jvm/java-11-openjdk-amd64/
+
+# Copy application files
+COPY libs/glue_functions /opt/spark/glue_functions
+COPY spark-class /usr/local/lib/python3.10/dist-packages/pyspark/bin/
+COPY sparkLambdaHandler.py /opt/spark/
+
+# Create a generic Spark runner script for Ubuntu
+RUN set -ex && \
+    # Install minimal runtime dependencies
+    apt-get update && \
+    apt-get install -y \
+        python3.10-minimal \
+        ca-certificates \
+        && \
+    # Create symbolic links
+    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
+    ln -sf /usr/bin/python3.10 /usr/bin/python && \
+    # Set permissions
+    chmod -R 755 /opt/spark/glue_functions /usr/local/lib/python3.10/dist-packages/pyspark && \
+    # Create non-root user for security
+    useradd -r -s /bin/false -d /opt/spark spark && \
+    chown -R spark:spark /opt/spark && \
+    # Cleanup
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Create a generic Spark application runner
+RUN cat > /opt/spark/run_spark.py << 'EOF'
+#!/usr/bin/env python3
+import os
+import sys
+import subprocess
+import boto3
+from pathlib import Path
+
+def download_script_from_s3(bucket, key, local_path):
+    """Download Spark script from S3"""
+    s3_client = boto3.client('s3')
+    s3_client.download_file(bucket, key, local_path)
+    return local_path
+
+def run_spark_script(script_path, *args):
+    """Run Spark script using spark-submit"""
+    spark_home = os.environ.get('SPARK_HOME')
+    spark_submit = os.path.join(spark_home, 'bin', 'spark-submit')
+
+    cmd = [spark_submit, script_path] + list(args)
+
+    print(f"Running: {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    print("STDOUT:", result.stdout)
+    if result.stderr:
+        print("STDERR:", result.stderr)
+
+    return result.returncode
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python run_spark.py <script_path_or_s3_uri> [args...]")
+        sys.exit(1)
+
+    script_arg = sys.argv[1]
+    script_args = sys.argv[2:] if len(sys.argv) > 2 else []
+
+    # Check if it's an S3 URI
+    if script_arg.startswith('s3://'):
+        # Parse S3 URI
+        parts = script_arg[5:].split('/', 1)
+        bucket = parts[0]
+        key = parts[1]
+
+        # Download to temporary location
+        local_script = f"/tmp/{Path(key).name}"
+        download_script_from_s3(bucket, key, local_script)
+        script_path = local_script
+    else:
+        script_path = script_arg
+
+    # Run the Spark script
+    exit_code = run_spark_script(script_path, *script_args)
+    sys.exit(exit_code)
+
+if __name__ == "__main__":
+    main()
+EOF
+
+RUN chmod +x /opt/spark/run_spark.py
+
+# Switch to non-root user
+USER spark
+
+# Consolidated environment variables
+ENV SPARK_HOME="/usr/local/lib/python3.10/dist-packages/pyspark" \
+    SPARK_VERSION=3.3.0 \
+    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \
+    PATH="/usr/local/lib/python3.10/dist-packages/pyspark/bin:/usr/local/lib/python3.10/dist-packages/pyspark/sbin:/usr/lib/jvm/java-11-openjdk-amd64/bin:$PATH" \
+    PYTHONPATH="/usr/local/lib/python3.10/dist-packages/pyspark/python:/usr/local/lib/python3.10/dist-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/opt/spark/glue_functions" \
+    INPUT_PATH="" \
+    OUTPUT_PATH="" \
+    AWS_ACCESS_KEY_ID="" \
+    AWS_SECRET_ACCESS_KEY="" \
+    AWS_REGION="" \
+    AWS_SESSION_TOKEN="" \
+    CUSTOM_SQL=""
+
+# Set working directory
+WORKDIR /opt/spark
+
+# Default command - run interactive Python with PySpark available
+CMD ["python3", "/opt/spark/run_spark.py"]
