Commit 9896ec6

dockerfile optimizations
1 parent 820fa92 commit 9896ec6

3 files changed: +490 -70 lines changed

Dockerfile

Lines changed: 55 additions & 70 deletions
@@ -1,85 +1,70 @@
-# Use AWS Lambda Python 3.8 image as base
-FROM public.ecr.aws/lambda/python:3.8
+# Multi-stage build for optimal size
+FROM public.ecr.aws/lambda/python:3.10 as builder
 
-# Setting the compatible versions of libraries
+# Build arguments - consolidated at top
 ARG HADOOP_VERSION=3.2.4
 ARG AWS_SDK_VERSION=1.11.901
 ARG PYSPARK_VERSION=3.3.0
-
-#FRAMEWORK will passed during the Docker build. For Apache Iceberg in somecase downgrading PYSPARK_VERSION to 3.2.0 will be good
 ARG FRAMEWORK
 ARG DELTA_FRAMEWORK_VERSION=2.2.0
 ARG HUDI_FRAMEWORK_VERSION=0.12.2
-
-
 ARG ICEBERG_FRAMEWORK_VERSION=3.3_2.12
 ARG ICEBERG_FRAMEWORK_SUB_VERSION=1.0.0
-
-
 ARG DEEQU_FRAMEWORK_VERSION=2.0.3-spark-3.3
 
-
-# Perform system updates and install dependencies
-RUN yum update -y && \
-    yum -y update zlib && \
-    yum -y install wget && \
-    yum -y install yum-plugin-versionlock && \
-    yum -y versionlock add java-1.8.0-openjdk-1.8.0.362.b08-0.amzn2.0.1.x86_64 && \
-    yum -y install java-1.8.0-openjdk && \
-    yum -y install unzip && \
-    pip install --upgrade pip && \
-    pip install pyspark==$PYSPARK_VERSION boto3 && \
-    yum clean all
-
-# Install pydeequ if FRAMEWORK is DEEQU
-#RUN if [ "$FRAMEWORK" = "DEEQU" ] ; then \
-#    pip install --no-deps pydeequ && \
-#    pip install pandas && \
-#    yum clean all; \
-#    else \
-#    echo FRAMEWORK is ${FRAMEWORK} ; \
-#    fi
-
-RUN echo "$FRAMEWORK" | grep -q "DEEQU" && \
-    pip install --no-deps pydeequ && \
-    pip install pandas && \
-    yum clean all && \
-    echo "DEEQU found in FRAMEWORK" || \
-    echo "DEEQU not found in FRAMEWORK"
-
-
-# Set environment variables for PySpark
-ENV SPARK_HOME="/var/lang/lib/python3.8/site-packages/pyspark"
-ENV SPARK_VERSION=3.3.0
-ENV PATH=$PATH:$SPARK_HOME/bin
-ENV PATH=$PATH:$SPARK_HOME/sbin
-ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:/home/glue_functions:$PYTHONPATH
-ENV PATH=$SPARK_HOME/python:$PATH
-
-COPY download_jars.sh /tmp
+# Single consolidated RUN layer for all build operations
+COPY download_jars.sh /tmp/
+RUN set -ex && \
+    # System updates and package installation
+    yum update -y && \
+    yum install -y java-11-amazon-corretto-headless wget unzip && \
+    yum clean all && \
+    rm -rf /var/cache/yum && \
+    # Python package installation
+    pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir pyspark==$PYSPARK_VERSION boto3 && \
+    # Conditional DEEQU installation
+    (echo "$FRAMEWORK" | grep -q "DEEQU" && \
+    pip install --no-cache-dir --no-deps pydeequ && \
+    pip install --no-cache-dir pandas || \
+    echo "DEEQU not found in FRAMEWORK") && \
+    # JAR download and cleanup
+    chmod +x /tmp/download_jars.sh && \
+    SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" && \
+    /tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION && \
+    rm -rf /tmp/* /var/tmp/*
+
+# Final optimized stage
+FROM public.ecr.aws/lambda/python:3.10
+
+# Single consolidated RUN layer for runtime setup
+COPY --from=builder /var/lang/lib/python3.10/site-packages/ /var/lang/lib/python3.10/site-packages/
+COPY --from=builder /var/runtime/ /var/runtime/
 COPY libs/glue_functions /home/glue_functions
-RUN chmod -R 755 /home/glue_functions
-
-RUN chmod +x /tmp/download_jars.sh && \
-    /tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION
-
-ENV PATH=${PATH}:${JAVA_HOME}/bin
-
-# Setting up the ENV vars for local code, in AWS LAmbda you have to set Input_path and Output_path
-ENV INPUT_PATH=""
-ENV OUTPUT_PATH=""
-ENV AWS_ACCESS_KEY_ID=""
-ENV AWS_SECRET_ACCESS_KEY=""
-ENV AWS_REGION=""
-ENV AWS_SESSION_TOKEN=""
-ENV CUSTOM_SQL=""
-
-# spark-class file is setting the memory to 1 GB
-COPY spark-class $SPARK_HOME/bin/
-RUN chmod -R 755 $SPARK_HOME
-
-# Copy the Pyspark script to container
+COPY spark-class /var/lang/lib/python3.10/site-packages/pyspark/bin/
 COPY sparkLambdaHandler.py ${LAMBDA_TASK_ROOT}
 
-# calling the Lambda handler
+RUN set -ex && \
+    # Install runtime Java and cleanup
+    yum update -y && \
+    yum install -y java-11-amazon-corretto-headless && \
+    yum clean all && \
+    rm -rf /var/cache/yum /tmp/* /var/tmp/* && \
+    # Set permissions in single operation
+    chmod -R 755 /home/glue_functions /var/lang/lib/python3.10/site-packages/pyspark
+
+# Consolidated environment variables
+ENV SPARK_HOME="/var/lang/lib/python3.10/site-packages/pyspark" \
+    SPARK_VERSION=3.3.0 \
+    JAVA_HOME="/usr/lib/jvm/java-11-amazon-corretto" \
+    PATH="$PATH:/var/lang/lib/python3.10/site-packages/pyspark/bin:/var/lang/lib/python3.10/site-packages/pyspark/sbin:/usr/lib/jvm/java-11-amazon-corretto/bin" \
+    PYTHONPATH="/var/lang/lib/python3.10/site-packages/pyspark/python:/var/lang/lib/python3.10/site-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/home/glue_functions" \
+    INPUT_PATH="" \
+    OUTPUT_PATH="" \
+    AWS_ACCESS_KEY_ID="" \
+    AWS_SECRET_ACCESS_KEY="" \
+    AWS_REGION="" \
+    AWS_SESSION_TOKEN="" \
+    CUSTOM_SQL=""
+
 CMD [ "/var/task/sparkLambdaHandler.lambda_handler" ]

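A minimal sketch of how this multi-stage Lambda image might be built and exercised locally. The image tag, the FRAMEWORK value, and the empty test payload are illustrative assumptions; the local-invoke endpoint comes from the Runtime Interface Emulator bundled in the AWS Lambda Python base images.

# Build, selecting a framework via the consolidated build args (DELTA is just an example)
docker build --build-arg FRAMEWORK=DELTA -t spark-on-lambda:delta .

# Run locally; the lambda/python base image exposes the Runtime Interface Emulator on port 8080
docker run --rm -p 9000:8080 spark-on-lambda:delta

# Invoke sparkLambdaHandler.lambda_handler through the emulator (payload is a placeholder)
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{}'
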
Dockerfile.ubuntu

Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
+# Multi-stage build for optimal size - Ubuntu version
+FROM ubuntu:22.04 as builder
+
+# Prevent interactive prompts during package installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Build arguments - consolidated at top
+ARG HADOOP_VERSION=3.2.4
+ARG AWS_SDK_VERSION=1.11.901
+ARG PYSPARK_VERSION=3.3.0
+ARG FRAMEWORK
+ARG DELTA_FRAMEWORK_VERSION=2.2.0
+ARG HUDI_FRAMEWORK_VERSION=0.12.2
+ARG ICEBERG_FRAMEWORK_VERSION=3.3_2.12
+ARG ICEBERG_FRAMEWORK_SUB_VERSION=1.0.0
+ARG DEEQU_FRAMEWORK_VERSION=2.0.3-spark-3.3
+
+# Single consolidated RUN layer for all build operations
+COPY download_jars.sh /tmp/
+RUN set -ex && \
+    # System updates and package installation
+    apt-get update && \
+    apt-get install -y \
+        python3.10 \
+        python3.10-dev \
+        python3.10-venv \
+        python3-pip \
+        openjdk-11-jre-headless \
+        wget \
+        unzip \
+        curl \
+        ca-certificates && \
+    # Create symbolic links for python
+    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
+    ln -sf /usr/bin/python3.10 /usr/bin/python && \
+    # Upgrade pip
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    # Python package installation
+    pip install --no-cache-dir pyspark==$PYSPARK_VERSION boto3 && \
+    # Conditional DEEQU installation
+    (echo "$FRAMEWORK" | grep -q "DEEQU" && \
+    pip install --no-cache-dir --no-deps pydeequ && \
+    pip install --no-cache-dir pandas || \
+    echo "DEEQU not found in FRAMEWORK") && \
+    # JAR download and cleanup
+    chmod +x /tmp/download_jars.sh && \
+    SPARK_HOME="/usr/local/lib/python3.10/dist-packages/pyspark" && \
+    /tmp/download_jars.sh $FRAMEWORK $SPARK_HOME $HADOOP_VERSION $AWS_SDK_VERSION $DELTA_FRAMEWORK_VERSION $HUDI_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_VERSION $ICEBERG_FRAMEWORK_SUB_VERSION $DEEQU_FRAMEWORK_VERSION && \
+    # Cleanup
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /root/.cache
+
+# Final optimized stage
+FROM ubuntu:22.04
+
+# Prevent interactive prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Copy Python packages and runtime from builder
+COPY --from=builder /usr/local/lib/python3.10/dist-packages/ /usr/local/lib/python3.10/dist-packages/
+COPY --from=builder /usr/bin/python* /usr/bin/
+COPY --from=builder /usr/lib/python3.10/ /usr/lib/python3.10/
+COPY --from=builder /usr/lib/jvm/java-11-openjdk-amd64/ /usr/lib/jvm/java-11-openjdk-amd64/
+
+# Copy application files
+COPY libs/glue_functions /opt/spark/glue_functions
+COPY spark-class /usr/local/lib/python3.10/dist-packages/pyspark/bin/
+COPY sparkLambdaHandler.py /opt/spark/
+
+# Create a generic Spark runner script for Ubuntu
+RUN set -ex && \
+    # Install minimal runtime dependencies
+    apt-get update && \
+    apt-get install -y \
+        python3.10-minimal \
+        ca-certificates \
+    && \
+    # Create symbolic links
+    ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
+    ln -sf /usr/bin/python3.10 /usr/bin/python && \
+    # Set permissions
+    chmod -R 755 /opt/spark/glue_functions /usr/local/lib/python3.10/dist-packages/pyspark && \
+    # Create non-root user for security
+    useradd -r -s /bin/false -d /opt/spark spark && \
+    chown -R spark:spark /opt/spark && \
+    # Cleanup
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Create a generic Spark application runner
+RUN cat > /opt/spark/run_spark.py << 'EOF'
+#!/usr/bin/env python3
+import os
+import sys
+import subprocess
+import boto3
+from pathlib import Path
+
+def download_script_from_s3(bucket, key, local_path):
+    """Download Spark script from S3"""
+    s3_client = boto3.client('s3')
+    s3_client.download_file(bucket, key, local_path)
+    return local_path
+
+def run_spark_script(script_path, *args):
+    """Run Spark script using spark-submit"""
+    spark_home = os.environ.get('SPARK_HOME')
+    spark_submit = os.path.join(spark_home, 'bin', 'spark-submit')
+
+    cmd = [spark_submit, script_path] + list(args)
+
+    print(f"Running: {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    print("STDOUT:", result.stdout)
+    if result.stderr:
+        print("STDERR:", result.stderr)
+
+    return result.returncode
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python run_spark.py <script_path_or_s3_uri> [args...]")
+        sys.exit(1)
+
+    script_arg = sys.argv[1]
+    script_args = sys.argv[2:] if len(sys.argv) > 2 else []
+
+    # Check if it's an S3 URI
+    if script_arg.startswith('s3://'):
+        # Parse S3 URI
+        parts = script_arg[5:].split('/', 1)
+        bucket = parts[0]
+        key = parts[1]
+
+        # Download to temporary location
+        local_script = f"/tmp/{Path(key).name}"
+        download_script_from_s3(bucket, key, local_script)
+        script_path = local_script
+    else:
+        script_path = script_arg
+
+    # Run the Spark script
+    exit_code = run_spark_script(script_path, *script_args)
+    sys.exit(exit_code)
+
+if __name__ == "__main__":
+    main()
+EOF
+
+RUN chmod +x /opt/spark/run_spark.py
+
+# Switch to non-root user
+USER spark
+
+# Consolidated environment variables
+ENV SPARK_HOME="/usr/local/lib/python3.10/dist-packages/pyspark" \
+    SPARK_VERSION=3.3.0 \
+    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" \
+    PATH="/usr/local/lib/python3.10/dist-packages/pyspark/bin:/usr/local/lib/python3.10/dist-packages/pyspark/sbin:/usr/lib/jvm/java-11-openjdk-amd64/bin:$PATH" \
+    PYTHONPATH="/usr/local/lib/python3.10/dist-packages/pyspark/python:/usr/local/lib/python3.10/dist-packages/pyspark/python/lib/py4j-0.10.9-src.zip:/opt/spark/glue_functions" \
+    INPUT_PATH="" \
+    OUTPUT_PATH="" \
+    AWS_ACCESS_KEY_ID="" \
+    AWS_SECRET_ACCESS_KEY="" \
+    AWS_REGION="" \
+    AWS_SESSION_TOKEN="" \
+    CUSTOM_SQL=""
+
+# Set working directory
+WORKDIR /opt/spark
+
+# Default command - run interactive Python with PySpark available
+CMD ["python3", "/opt/spark/run_spark.py"]

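A sketch of how the Ubuntu variant might be built and used, assuming the same repository layout; the image tag, FRAMEWORK value, mounted jobs directory, region, local script path, and S3 URI are illustrative placeholders, not part of the commit.

# Build the Ubuntu-based image from its own Dockerfile
docker build -f Dockerfile.ubuntu --build-arg FRAMEWORK=DELTA -t spark-on-ubuntu:delta .

# Run a local PySpark script through the generic runner (run_spark.py wraps spark-submit)
docker run --rm -v "$PWD/jobs:/opt/spark/jobs" spark-on-ubuntu:delta \
    python3 /opt/spark/run_spark.py /opt/spark/jobs/my_job.py

# Or hand the runner an S3 URI; it downloads the script to /tmp before calling spark-submit,
# so AWS credentials must be supplied, e.g. via -e AWS_ACCESS_KEY_ID=... -e AWS_SECRET_ACCESS_KEY=...
docker run --rm -e AWS_REGION=us-east-1 spark-on-ubuntu:delta \
    python3 /opt/spark/run_spark.py s3://my-bucket/jobs/my_job.py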