# syntax=docker/dockerfile:1
# Multi-stage build for optimal size - Ubuntu version
# (the heredoc used further down requires BuildKit, hence the syntax directive above)
FROM ubuntu:22.04 AS builder

# Prevent interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Build arguments - consolidated at top
ARG HADOOP_VERSION=3.2.4
ARG AWS_SDK_VERSION=1.11.901
ARG PYSPARK_VERSION=3.3.0
ARG FRAMEWORK
ARG DELTA_FRAMEWORK_VERSION=2.2.0
ARG HUDI_FRAMEWORK_VERSION=0.12.2
ARG ICEBERG_FRAMEWORK_VERSION=3.3_2.12
ARG ICEBERG_FRAMEWORK_SUB_VERSION=1.0.0
ARG DEEQU_FRAMEWORK_VERSION=2.0.3-spark-3.3

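# Example build invocation (a sketch, not part of this file: the image tag is a
# placeholder, and the FRAMEWORK values DELTA/HUDI/ICEBERG/DEEQU are assumptions
# based on the version ARGs above and on what download_jars.sh appears to accept):
#
#   DOCKER_BUILDKIT=1 docker build \
#     --build-arg FRAMEWORK=DELTA \
#     --build-arg DELTA_FRAMEWORK_VERSION=2.2.0 \
#     -t spark-ubuntu-runner .
#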
# Single consolidated RUN layer for all build operations
COPY download_jars.sh /tmp/
RUN set -ex && \
    # System updates and package installation
    apt-get update && \
    apt-get install -y \
        python3.10 \
        python3.10-dev \
        python3.10-venv \
        python3-pip \
        openjdk-11-jre-headless \
        wget \
        unzip \
        curl \
        ca-certificates && \
    # Create symbolic links for python
    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
    ln -sf /usr/bin/python3.10 /usr/bin/python && \
    # Upgrade pip
    python3 -m pip install --no-cache-dir --upgrade pip && \
    # Python package installation
    pip install --no-cache-dir pyspark==$PYSPARK_VERSION boto3 && \
    # Conditional DEEQU installation (if/else so a failed pip install is not masked)
    if echo "$FRAMEWORK" | grep -q "DEEQU"; then \
        pip install --no-cache-dir --no-deps pydeequ && \
        pip install --no-cache-dir pandas; \
    else \
        echo "DEEQU not found in FRAMEWORK"; \
    fi && \
    # JAR download and cleanup (arguments quoted so positional order is preserved)
    chmod +x /tmp/download_jars.sh && \
    SPARK_HOME="/usr/local/lib/python3.10/dist-packages/pyspark" && \
    /tmp/download_jars.sh "$FRAMEWORK" "$SPARK_HOME" "$HADOOP_VERSION" "$AWS_SDK_VERSION" "$DELTA_FRAMEWORK_VERSION" "$HUDI_FRAMEWORK_VERSION" "$ICEBERG_FRAMEWORK_VERSION" "$ICEBERG_FRAMEWORK_SUB_VERSION" "$DEEQU_FRAMEWORK_VERSION" && \
    # Cleanup
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /root/.cache

# Final optimized stage
FROM ubuntu:22.04

# Prevent interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Copy Python packages and runtime from builder
COPY --from=builder /usr/local/lib/python3.10/dist-packages/ /usr/local/lib/python3.10/dist-packages/
COPY --from=builder /usr/bin/python* /usr/bin/
COPY --from=builder /usr/lib/python3.10/ /usr/lib/python3.10/
COPY --from=builder /usr/lib/jvm/java-11-openjdk-amd64/ /usr/lib/jvm/java-11-openjdk-amd64/

# Copy application files
COPY libs/glue_functions /opt/spark/glue_functions
COPY spark-class /usr/local/lib/python3.10/dist-packages/pyspark/bin/
COPY sparkLambdaHandler.py /opt/spark/

# Install minimal runtime dependencies and set up the non-root user
RUN set -ex && \
    # Install minimal runtime dependencies
    apt-get update && \
    apt-get install -y --no-install-recommends \
        python3.10-minimal \
        ca-certificates \
        && \
    # Create symbolic links
    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
    ln -sf /usr/bin/python3.10 /usr/bin/python && \
    # Set permissions
    chmod -R 755 /opt/spark/glue_functions /usr/local/lib/python3.10/dist-packages/pyspark && \
    # Create non-root user for security
    useradd -r -s /bin/false -d /opt/spark spark && \
    chown -R spark:spark /opt/spark && \
    # Cleanup
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Create a generic Spark application runner (written via a BuildKit heredoc)
RUN cat <<'EOF' > /opt/spark/run_spark.py
#!/usr/bin/env python3
import os
import sys
import subprocess
import boto3
from pathlib import Path

def download_script_from_s3(bucket, key, local_path):
    """Download Spark script from S3"""
    s3_client = boto3.client('s3')
    s3_client.download_file(bucket, key, local_path)
    return local_path

def run_spark_script(script_path, *args):
    """Run Spark script using spark-submit"""
    # Fall back to the PySpark install location if SPARK_HOME is not set
    spark_home = os.environ.get('SPARK_HOME', '/usr/local/lib/python3.10/dist-packages/pyspark')
    spark_submit = os.path.join(spark_home, 'bin', 'spark-submit')

    cmd = [spark_submit, script_path] + list(args)

    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    print("STDOUT:", result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)

    return result.returncode

def main():
    if len(sys.argv) < 2:
        print("Usage: python run_spark.py <script_path_or_s3_uri> [args...]")
        sys.exit(1)

    script_arg = sys.argv[1]
    script_args = sys.argv[2:]

    # Check if it's an S3 URI
    if script_arg.startswith('s3://'):
        # Parse the S3 URI into bucket and key
        parts = script_arg[5:].split('/', 1)
        bucket = parts[0]
        key = parts[1]

        # Download to a temporary location
        local_script = f"/tmp/{Path(key).name}"
        download_script_from_s3(bucket, key, local_script)
        script_path = local_script
    else:
        script_path = script_arg

    # Run the Spark script
    exit_code = run_spark_script(script_path, *script_args)
    sys.exit(exit_code)

if __name__ == "__main__":
    main()
EOF

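# Example invocation of the runner inside the container (a sketch; the bucket,
# key, and job arguments are hypothetical placeholders, and AWS credentials
# must be available at runtime):
#
#   python3 /opt/spark/run_spark.py s3://example-bucket/jobs/etl_job.py --date 2023-01-01
#   python3 /opt/spark/run_spark.py /opt/spark/local_job.py
#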
RUN chmod +x /opt/spark/run_spark.py && \
    chown spark:spark /opt/spark/run_spark.py

# Switch to non-root user
USER spark

# Consolidated environment variables
ENV SPARK_HOME="/usr/local/lib/python3.10/dist-packages/pyspark" \
    SPARK_VERSION=3.3.0 \
    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \
    PATH="/usr/local/lib/python3.10/dist-packages/pyspark/bin:/usr/local/lib/python3.10/dist-packages/pyspark/sbin:/usr/lib/jvm/java-11-openjdk-amd64/bin:$PATH" \
    PYTHONPATH="/usr/local/lib/python3.10/dist-packages/pyspark/python:/usr/local/lib/python3.10/dist-packages/pyspark/python/lib/py4j-0.10.9.5-src.zip:/opt/spark/glue_functions" \
    INPUT_PATH="" \
    OUTPUT_PATH="" \
    AWS_ACCESS_KEY_ID="" \
    AWS_SECRET_ACCESS_KEY="" \
    AWS_REGION="" \
    AWS_SESSION_TOKEN="" \
    CUSTOM_SQL=""

# Set working directory
WORKDIR /opt/spark

# Default command - print the runner usage; override with a script path or S3 URI to run a job
CMD ["python3", "/opt/spark/run_spark.py"]
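#
# Example run invocation (a sketch; the image tag, bucket, and credential values
# below are placeholders, not defined by this file):
#
#   docker run --rm \
#     -e AWS_REGION=us-east-1 \
#     -e AWS_ACCESS_KEY_ID=... -e AWS_SECRET_ACCESS_KEY=... \
#     spark-ubuntu-runner \
#     python3 /opt/spark/run_spark.py s3://example-bucket/jobs/etl_job.py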