Skip to content

Commit 9b0d91b

Browse files
authored
chore(precompiled/hadoop): Add image for precompiled hadoop for later reuse (#1466)
* chore: Move hadoop/hadoop source build to precompiled/hadoop
* chore: Update relative path (hadoop/hadoop -> precompiled/hadoop)
* ci(precompiled/hadoop): Compile hadoop source and publish as an image
* chore(hadoop): Restore hadoop/hadoop (this will be removed in the Part 2 PR)
* ci(precompiled/hadoop): Make zizmor happy
* chore(nix): Add zizmor dependency
* chore: Update changelog
1 parent 37acdf2 commit 9b0d91b

31 files changed

Lines changed: 3895 additions & 8 deletions
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
---
name: Compile Hadoop
run-name: |
  Compile Hadoop (attempt #${{ github.run_attempt }})

# Runs on manual dispatch, or on pushes to main that touch the precompiled
# Hadoop sources.
on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - precompiled/hadoop/**
      # Workflow and action changes deliberately do not trigger a rebuild:
      # once a version is compiled (for a given patch set) it needs no rebuild.
      # - .github/actions/**
      # - .github/workflows/precompile_hadoop.yaml
      # - .github/workflows/reusable_build_image.yaml

# Nothing is granted at workflow level; each job requests what it needs.
permissions: {}

jobs:
  # Kept as its own job so the value stays the same when only the failed jobs
  # are rerun. The timestamp stands in for the "sdp-version" that is passed to
  # the build action.
  generate_build_timestamp:
    name: Generate unix timestamp
    runs-on: ubuntu-latest
    outputs:
      unix_timestamp: ${{ steps.unix_timestamp.outputs.unix_timestamp }}
    steps:
      - id: unix_timestamp
        shell: bash
        run: |
          set -euo pipefail
          NOW=$(date +%s)
          echo "unix_timestamp=$NOW" | tee -a "$GITHUB_OUTPUT"

  build_image:
    name: Reusable Workflow
    needs:
      - generate_build_timestamp
    uses: ./.github/workflows/reusable_build_image.yaml
    permissions:
      # Needed for checkout
      contents: read
      # Needed for cosign (sign and attest)
      id-token: write
    secrets:
      harbor-robot-secret: ${{ secrets.HARBOR_ROBOT_PRECOMPILED_GITHUB_ACTION_BUILD_SECRET }}
      slack-token: ${{ secrets.SLACK_CONTAINER_IMAGE_TOKEN }}
    with:
      product-name: hadoop
      registry-namespace: precompiled
      sdp-version: ${{ needs.generate_build_timestamp.outputs.unix_timestamp }}

.github/workflows/reusable_build_image.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ jobs:
146146
name: Failure Notification
147147
needs: [generate_version_dimension, build, publish_manifests]
148148
runs-on: ubuntu-latest
149+
# TODO (@NickLarsenNZ): Allow a condition from input so that we can always
150+
# be notified of new builds for precompiled product images.
149151
if: failure() || (github.run_attempt > 1 && !cancelled())
150152
steps:
151153
- name: Send Notification

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
44

55
## [Unreleased]
66

7+
### Added
8+
9+
- hadoop: Add precompiled hadoop for later reuse in dependent images ([#1466]).
10+
711
### Changed
812

913
- hbase: Update `hbase-opa-authorizer` from `0.1.0` to `0.2.0` and then `0.3.0` ([#1446], [#1454]).
@@ -17,6 +21,7 @@ All notable changes to this project will be documented in this file.
1721
[#1452]: https://github.com/stackabletech/docker-images/pull/1452
1822
[#1453]: https://github.com/stackabletech/docker-images/pull/1453
1923
[#1454]: https://github.com/stackabletech/docker-images/pull/1454
24+
[#1466]: https://github.com/stackabletech/docker-images/pull/1466
2025

2126
## [26.3.0] - 2026-03-16
2227

precompiled/hadoop/Dockerfile

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
# syntax=docker/dockerfile:1.16.0@sha256:e2dd261f92e4b763d789984f6eab84be66ab4f5f08052316d8eb8f173593acf7
# check=error=true

# Builder stage: compiles protobuf and Hadoop from patched sources.
FROM local-image/java-devel AS hadoop-builder

ARG PRODUCT_VERSION
ARG RELEASE_VERSION
ARG PROTOBUF_VERSION
ARG STACKABLE_USER_UID

WORKDIR /stackable

# Patch metadata and the version-specific patch set for protobuf.
COPY --chown=${STACKABLE_USER_UID}:0 shared/protobuf/stackable/patches/patchable.toml /stackable/src/shared/protobuf/stackable/patches/patchable.toml
COPY --chown=${STACKABLE_USER_UID}:0 shared/protobuf/stackable/patches/${PROTOBUF_VERSION} /stackable/src/shared/protobuf/stackable/patches/${PROTOBUF_VERSION}

RUN <<EOF
# Enable EPEL and bring the installed packages up to date.
rpm --install --replacepkgs https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
microdnf update

# boost:             build dependency of the native code since Hadoop 3.4.0
# automake, libtool: required to build protobuf
microdnf install boost1.78-devel automake libtool

# Drop package manager caches.
microdnf clean all
rm -rf /var/cache/yum

# Install prefix for protobuf, owned by the unprivileged build user.
mkdir /opt/protobuf
chown ${STACKABLE_USER_UID}:0 /opt/protobuf
EOF
27+
USER ${STACKABLE_USER_UID}
# Build protobuf from patched sources and install it into /opt/protobuf.
#
# The protobuf version must be exactly the one used by the upstream Hadoop Dockerfile.
# See https://github.com/apache/hadoop/blob/trunk/dev-support/docker/pkg-resolver/install-protobuf.sh
# (earlier Hadoop versions hardcoded it in the Dockerfile, so check the exact Hadoop version on GitHub)
RUN <<EOF
# Resolve the checkout directory first and abort if the checkout fails.
# With `cd "$(...)"` a failing checkout would be masked: cd would receive an
# empty argument and the following steps would run in the wrong directory.
PROTOBUF_SRC_DIR="$(/stackable/patchable --images-repo-root=src checkout shared/protobuf ${PROTOBUF_VERSION})" || exit 1
cd "${PROTOBUF_SRC_DIR}" || exit 1

# Create snapshot of the source code including custom patches
tar -czf /stackable/protobuf-${PROTOBUF_VERSION}-src.tar.gz .

./autogen.sh
./configure --prefix=/opt/protobuf
make "-j$(nproc)"
make install

# Remove the source tree again
(cd .. && rm -r ${PROTOBUF_VERSION})
EOF

# Make the freshly built protobuf visible to the Hadoop build.
ENV PROTOBUF_HOME=/opt/protobuf
ENV PATH="${PATH}:/opt/protobuf/bin"
47+
WORKDIR /build
# Patch metadata, the version-specific patch set and the fuse_dfs wrapper script.
COPY --chown=${STACKABLE_USER_UID}:0 precompiled/hadoop/stackable/patches/patchable.toml /build/src/precompiled/hadoop/stackable/patches/patchable.toml
COPY --chown=${STACKABLE_USER_UID}:0 precompiled/hadoop/stackable/patches/${PRODUCT_VERSION} /build/src/precompiled/hadoop/stackable/patches/${PRODUCT_VERSION}
COPY --chown=${STACKABLE_USER_UID}:0 precompiled/hadoop/stackable/fuse_dfs_wrapper /build
USER ${STACKABLE_USER_UID}
# Hadoop is built from source to enable the FUSE module and to apply custom patches.
#
# Hadoop Pipes requires libtirpc to build, whose headers are not packaged in RedHat UBI, so that module is skipped.
# The yarn, mapreduce and minicluster modules are dropped from the result as well. Excluding a module does not strip
# every one of its jars if they are needed elsewhere: e.g. share/hadoop/yarn will not be part of the build, but yarn
# jars still end up in share/hadoop/tools because the resource estimator tool needs them.
# Such jars are removed in a later step of this script.
RUN <<EOF
# Resolve the checkout directory first and abort if the checkout fails.
# With `cd "$(...)"` a failing checkout would be masked: cd would receive an
# empty argument and the following steps would run in the wrong directory.
HADOOP_SRC_DIR="$(/stackable/patchable --images-repo-root=src checkout precompiled/hadoop ${PRODUCT_VERSION})" || exit 1
cd "${HADOOP_SRC_DIR}" || exit 1

ORIGINAL_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
NEW_VERSION=${PRODUCT_VERSION}-stackable${RELEASE_VERSION}

mvn versions:set -DnewVersion=${NEW_VERSION}

# Since we skip building the hadoop-pipes module, its dependency version must point back to the original version
# so that the artifact can be pulled from Maven Central
sed -e '/<artifactId>hadoop-pipes<\/artifactId>/,/<\/dependency>/ { s/<version>.*<\/version>/<version>'"$ORIGINAL_VERSION"'<\/version>/ }' -i hadoop-tools/hadoop-tools-dist/pom.xml

# Create snapshot of the source code including custom patches
tar -czf /stackable/hadoop-${NEW_VERSION}-src.tar.gz .

# We do not pass require.snappy because that is only built in to the MapReduce client and we don't need that
#
# Passing require.openssl SHOULD make the build fail if OpenSSL is not present.
# This check is not fully reliable: this builder image contains the openssl-devel package, which creates a symlink
# from /usr/lib64/libcrypto.so to the real version, so the build succeeds here.
# The final image does NOT contain openssl-devel, so the library is not found there and the symlink has to be
# created manually in that image.
# We still leave this flag in to automatically fail should anything with the packages or symlinks ever break.
mvn \
    clean package install \
    -Pdist,native \
    -pl '!hadoop-tools/hadoop-pipes' \
    -Dhadoop.version=${NEW_VERSION} \
    -Drequire.fuse=true \
    -Drequire.openssl=true \
    -DskipTests \
    -Dmaven.javadoc.skip=true

# Keep the locally installed (patched) Hadoop Maven artifacts
mkdir -p /stackable/patched-libs/maven/org/apache
cp -r /stackable/.m2/repository/org/apache/hadoop /stackable/patched-libs/maven/org/apache

# Drop yarn, mapreduce and minicluster from the dist output before copying it
rm -rf hadoop-dist/target/hadoop-${NEW_VERSION}/share/hadoop/yarn
rm -rf hadoop-dist/target/hadoop-${NEW_VERSION}/share/hadoop/mapreduce
rm hadoop-dist/target/hadoop-${NEW_VERSION}/share/hadoop/client/hadoop-client-minicluster-*.jar
rm hadoop-dist/target/hadoop-${NEW_VERSION}/share/hadoop/tools/lib/hadoop-minicluster-*.jar

# Copy the dist output to /stackable and ship the SBOM with the original version restored in it
cp -r hadoop-dist/target/hadoop-${NEW_VERSION} /stackable/hadoop-${NEW_VERSION}
sed -i "s/${NEW_VERSION}/${ORIGINAL_VERSION}/g" hadoop-dist/target/bom.json
mv hadoop-dist/target/bom.json /stackable/hadoop-${NEW_VERSION}/hadoop-${NEW_VERSION}.cdx.json

# HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${NEW_VERSION}/bin

# Remove source code
(cd .. && rm -r ${PRODUCT_VERSION})

ln -s /stackable/hadoop-${NEW_VERSION} /stackable/hadoop

mv /build/fuse_dfs_wrapper /stackable/hadoop/bin

# Remove unneeded binaries:
#  - code sources
#  - mapreduce/yarn binaries that were built as cross-project dependencies
#  - minicluster (only used for testing) and test .jars
#  - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
rm -rf /stackable/hadoop/share/hadoop/common/sources/
rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/
rm -rf /stackable/hadoop/share/hadoop/tools/sources/
rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar
rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar
rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar
find /stackable/hadoop -name 'hadoop-minicluster-*.jar' -type f -delete
find /stackable/hadoop -name 'hadoop-client-minicluster-*.jar' -type f -delete
find /stackable/hadoop -name 'hadoop-*tests.jar' -type f -delete
rm -rf /stackable/.m2

# Set correct groups; make sure only required artifacts for the final image are located in /stackable
chmod -R g=u /stackable
EOF
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
# One table per supported Hadoop version: the local images it builds on and
# the build arguments handed to the Dockerfile.

[versions."3.3.6"]
local-images.java-devel = "11"
build-arguments.protobuf-version = "3.7.1"

[versions."3.4.2"]
local-images.java-devel = "11"
build-arguments.protobuf-version = "3.7.1"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# Wrapper around fuse_dfs: sets up CLASSPATH, LD_LIBRARY_PATH, PATH and
# HADOOP_HOME, then hands over to fuse_dfs with all arguments unchanged.

set -e

# JNI does _NOT_ support wildcards in the Classpath so we can't use the usual /stackable/hadoop/share/hadoop/client/* here
# Instead we need to use find and then concatenate them all with colons.
# There is a trailing colon at the end but that does seem to work just fine
# Not all three directories might be needed, common is definitely needed
CLASSPATH=$(find -L /stackable/hadoop/share/hadoop/client/ /stackable/hadoop/share/hadoop/hdfs/ /stackable/hadoop/share/hadoop/common -type f -name "*.jar" -print0 | xargs -0 printf "%s:")

# The Hadoop configuration directory goes first on the classpath.
export CLASSPATH=$HADOOP_CONF_DIR:$CLASSPATH
export LD_LIBRARY_PATH=/stackable/hadoop/lib/native:/usr/lib/jvm/jre/lib/server
export PATH="${PATH}":/stackable/hadoop/bin
export HADOOP_HOME=/stackable/hadoop

# exec replaces this shell with fuse_dfs, so signals sent to the wrapper's PID
# and the exit status belong to fuse_dfs itself instead of an intermediate bash.
exec fuse_dfs "$@"
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
From bd2fa3a3a5ef57c5f6ca4f0e5535a1cd875e50d1 Mon Sep 17 00:00:00 2001
2+
From: Siegfried Weber <mail@siegfriedweber.net>
3+
Date: Thu, 21 Dec 2023 13:51:13 +0100
4+
Subject: YARN-11527: Update node.js
5+
6+
---
7+
hadoop-project/pom.xml | 2 +-
8+
1 file changed, 1 insertion(+), 1 deletion(-)
9+
10+
diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml
11+
index f1ac43ed5b..9b01858e0e 100644
12+
--- a/hadoop-project/pom.xml
13+
+++ b/hadoop-project/pom.xml
14+
@@ -213,7 +213,7 @@
15+
<openssl-wildfly.version>1.1.3.Final</openssl-wildfly.version>
16+
<woodstox.version>5.4.0</woodstox.version>
17+
<nimbus-jose-jwt.version>9.8.1</nimbus-jose-jwt.version>
18+
- <nodejs.version>v12.22.1</nodejs.version>
19+
+ <nodejs.version>v14.17.0</nodejs.version>
20+
<yarnpkg.version>v1.22.5</yarnpkg.version>
21+
<apache-ant.version>1.10.13</apache-ant.version>
22+
<jmh.version>1.20</jmh.version>

0 commit comments

Comments
 (0)