3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -5,4 +5,5 @@ node_modules

# draw.io temp files
.$*.bkp
.$*.dtmp
.$*.dtmp
venv
2 changes: 2 additions & 0 deletions metrics-collector/.ceignore
@@ -0,0 +1,2 @@
images/
setup/
56 changes: 48 additions & 8 deletions metrics-collector/Dockerfile
@@ -1,11 +1,51 @@
FROM quay.io/projectquay/golang:1.23 AS build-env
# Stage 1: Build Go binary
FROM quay.io/projectquay/golang:1.25 AS go-builder
WORKDIR /go/src/app
COPY . .

COPY go.mod go.sum ./
RUN go mod download
RUN CGO_ENABLED=0 go build -o /go/bin/app main.go
COPY main.go ./
RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o app main.go

# Stage 2: Download and extract Prometheus
FROM busybox:1.36-glibc AS prometheus-downloader
ARG PROMETHEUS_VERSION=3.10.0
ARG TARGETARCH=amd64

WORKDIR /tmp
RUN wget https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \
tar xzf prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \
mv prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}/prometheus /prometheus

# Stage 3: Get CA certificates
FROM alpine:latest AS certs
RUN apk --no-cache add ca-certificates

# Stage 4: Runtime image
FROM busybox:1.36-glibc

# Copy CA certificates for TLS verification
COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt

# Copy Go binary
COPY --from=go-builder /go/src/app/app /app

# Copy Prometheus binary
COPY --from=prometheus-downloader /prometheus /bin/prometheus

# Copy configuration and scripts
COPY prometheus.yml.template /etc/prometheus/prometheus.yml.template
COPY start.sh /start.sh
RUN chmod +x /start.sh

# Create necessary directories with proper permissions
RUN mkdir -p /tmp/agent-data && \
mkdir -p /etc/secrets && \
chmod 777 /tmp/agent-data

# Set SSL certificate path environment variable
ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt

# Use non-root user
USER 1000:1000

# Copy the exe into a smaller base image
FROM gcr.io/distroless/static-debian12
COPY --from=build-env /go/bin/app /
CMD ["/app"]
ENTRYPOINT ["/start.sh"]
201 changes: 117 additions & 84 deletions metrics-collector/README.md
@@ -1,59 +1,132 @@
# IBM Cloud Code Engine - Metrics Collector

Code Engine job that demonstrates how to collect resource metrics (CPU, memory and disk usage) of running Code Engine apps, jobs, and builds
Code Engine job that demonstrates how to collect resource metrics (CPU, memory and disk usage) of running Code Engine apps, jobs, and builds. Those metrics can either be rendered

in **IBM Cloud Monitoring** (see [instructions](#Send-metrics-to-IBM-Cloud-Monitoring))

![](./images/monitoring-dashboard-ce-component-resources.png)

or in **IBM Cloud Logs** (see [instructions](#ibm-cloud-logs-setup))

![Dashboard overview](./images/icl-dashboard-overview.png)

## Installation

### Capture metrics every n seconds
## Send metrics to IBM Cloud Monitoring

* Create Code Engine job template
```
$ ibmcloud ce job create \
--name metrics-collector \
--src . \
--mode daemon \
--cpu 0.25 \
--memory 0.5G \
--wait
```
### How It Works

* Submit a daemon job that collects metrics in an endless loop. The daemon job queries the Metrics API every 30 seconds
```
$ ibmcloud ce jobrun submit \
--job metrics-collector \
--env INTERVAL=30
![](./images/metrics-collector.overview.png)

1. The metrics collector exposes Prometheus metrics on `localhost:9100/metrics`
2. The embedded Prometheus agent scrapes these metrics every 30 seconds
3. The agent also discovers and scrapes pods with the `codeengine.cloud.ibm.com/userMetricsScrape: 'true'` annotation
4. All metrics are forwarded to IBM Cloud Monitoring via remote write
5. If either the collector or Prometheus agent crashes, the container exits with a non-zero code to trigger a restart
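Conceptually, the embedded agent's configuration (rendered from `prometheus.yml.template` at startup) combines a self-scrape of the collector with remote write to IBM Cloud Monitoring. The sketch below is an illustration only; the names, remote-write path, and placeholders are assumptions, not the shipped template:

```yaml
global:
  scrape_interval: 30s                  # matches the 30s cadence described above
scrape_configs:
  - job_name: metrics-collector
    static_configs:
      - targets: ["localhost:9100"]     # the collector's own /metrics endpoint
remote_write:
  - url: "https://${METRICS_REMOTE_WRITE_FQDN}/prometheus/remote/write"  # assumed path
    authorization:
      credentials_file: /etc/secrets/monitoring-apikey
```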

### Setup Instructions

**Step 1:** You need an IBM Cloud Monitoring instance. Look up its GUID:
```bash
REGION=<yourMonitoringInstanceRegion>
MONITORING_INSTANCE_NAME="<yourMonitoringInstanceName>"
MONITORING_INSTANCE_GUID=$(ibmcloud resource service-instance "$MONITORING_INSTANCE_NAME" -o JSON|jq -r '.[0].guid')
echo "MONITORING_INSTANCE_GUID: '$MONITORING_INSTANCE_GUID'"
```
**Step 2:** The collector must run in a Code Engine project
```bash
# Create new Code Engine project
ibmcloud ce project create --name <yourCodeEngineProjectName>

# Select an existing Code Engine project
ibmcloud ce project select --name <yourProjectName>
```

### Capture metrics every n minutes
**Step 3:** Create a secret with your IBM Cloud Monitoring API token
```bash
# Obtain the Monitoring API token of the IBM Cloud Monitoring instance
# using the IAM access token of the current IBM CLI Session
MONITORING_INSTANCE_MONITORING_API_KEY=$(curl --silent -X GET https://$REGION.monitoring.cloud.ibm.com/api/token -H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" -H "IBMInstanceID: $MONITORING_INSTANCE_GUID" -H "content-type: application/json"|jq -r '.token.key')

* Create Code Engine job template
# Create a Code Engine secret that stores the Monitoring API Key
ibmcloud ce secret create \
--name monitoring-apikey \
--from-literal monitoring-apikey=$MONITORING_INSTANCE_MONITORING_API_KEY
```
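The `jq` filter in the token request above assumes a response shaped like `{"token":{"key":"..."}}`. A quick local sanity check of the extraction, using a mock payload (the key value is a dummy placeholder):

```shell
# Mock of the assumed /api/token response shape; "abc123" is a dummy value
echo '{"token":{"key":"abc123"}}' | jq -r '.token.key'
```

This prints `abc123`; the same filter applied to the real response yields the Monitoring API key stored in the secret.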
$ ibmcloud ce job create \

**Step 4:** Create your metrics-collector job with the required configuration
```bash
ibmcloud ce job create \
--name metrics-collector \
--src . \
--mode task \
--src "." \
--mode daemon \
--cpu 0.25 \
--memory 0.5G \
--wait
--service-account reader \
--build-size xlarge \
--env INTERVAL=30 \
--env METRICS_ENABLED=true \
--env METRICS_REMOTE_WRITE_FQDN=ingest.prws.private.${REGION}.monitoring.cloud.ibm.com \
--env CE_PROJECT_NAME="$(ibmcloud ce proj current --output json|jq -r '.name')" \
--mount-secret /etc/secrets=monitoring-apikey
```

* Submit a Code Engine cron subscription that triggers the metrics collector every minute to query the Metrics API
**Step 5:** Submit a daemon job run
```bash
ibmcloud ce jobrun submit \
--job metrics-collector
```
$ ibmcloud ce subscription cron create \
--name collect-metrics-every-minute \
--destination-type job \
--destination metrics-collector \
--schedule '*/1 * * * *'

**Step 6:** Import the "IBM Cloud Code Engine - Component Resource Overview" dashboard
```bash
# Load the most recent dashboard configuration
CE_MONITORING_DASHBOARD=$(curl -sL https://raw.githubusercontent.com/IBM/CodeEngine/main/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json)

# Import the dashboard
curl -X POST https://$REGION.monitoring.cloud.ibm.com/api/v3/dashboards \
-H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" \
-H "IBMInstanceID: $MONITORING_INSTANCE_GUID" \
-H "Content-Type: application/json" \
-d "{\"dashboard\": $CE_MONITORING_DASHBOARD}"

```

## Configuration
**Note:** A more elaborate approach to managing custom Cloud Monitoring dashboards can be found [here](setup/ibm-cloud-monitoring/README.md)


Per default the metrics collector collects memory and CPU statistics, like `usage`, `current` and `configured`.
#### Exposed Metrics

One can use the environment variable `COLLECT_DISKUSAGE=true` to also collect the amount of disk space that is used. Please note, the metrics collector can only calculate the overall file size stored in the pods filesystem which includes files that are part of the container image, the epheremal storage as well as mounted COS buckets. Hence, this metric cannot be used to calculate the ephemeral storage usage.
The following Prometheus metrics are exposed (all gauges unless noted otherwise):

Container Metrics:
- **`ibm_codeengine_instance_cpu_usage_millicores`**: Current CPU usage in millicores
- **`ibm_codeengine_instance_cpu_limit_millicores`**: Configured CPU limit in millicores
- **`ibm_codeengine_instance_memory_usage_bytes`**: Current memory usage in bytes
- **`ibm_codeengine_instance_memory_limit_bytes`**: Configured memory limit in bytes
- **`ibm_codeengine_instance_ephemeral_storage_usage_bytes`**: Current ephemeral storage usage in bytes (if `COLLECT_DISKUSAGE=true`)

The following 3 metrics are used to monitor the collector itself:
- **`ibm_codeengine_collector_collection_duration_seconds`**: Time taken to collect metrics in seconds (if `METRICS_INTERNAL_STATS=true`)
- **`ibm_codeengine_collector_last_collection_timestamp_seconds`**: Unix timestamp of last successful collection (if `METRICS_INTERNAL_STATS=true`)
- **`ibm_codeengine_collector_collection_errors_total`**: Total number of collection errors (counter) (if `METRICS_INTERNAL_STATS=true`)

#### Metric Labels

All container metrics include the following labels:
- `ibm_codeengine_instance_name`: Name of the pod instance
- `ibm_codeengine_component_type`: Type of component (`app`, `job`, or `build`)
- `ibm_codeengine_component_name`: Name of the Code Engine component

#### Example Metrics Output

```prometheus
# HELP ibm_codeengine_instance_cpu_usage_millicores Current CPU usage in millicores
# TYPE ibm_codeengine_instance_cpu_usage_millicores gauge
ibm_codeengine_instance_cpu_usage_millicores{ibm_codeengine_instance_name="myapp-00001-deployment-abc123",ibm_codeengine_component_type="app",ibm_codeengine_component_name="myapp"} 250

# HELP ibm_codeengine_instance_memory_usage_bytes Current memory usage in bytes
# TYPE ibm_codeengine_instance_memory_usage_bytes gauge
ibm_codeengine_instance_memory_usage_bytes{ibm_codeengine_instance_name="myapp-00001-deployment-abc123",ibm_codeengine_component_type="app",ibm_codeengine_component_name="myapp"} 134217728
```
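Once these series arrive in IBM Cloud Monitoring, PromQL can derive utilization ratios from the gauges listed above, for example (a sketch using the documented metric and label names):

```promql
# Memory utilization (0-1) per Code Engine component
sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_memory_usage_bytes)
/
sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_memory_limit_bytes)
```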

## IBM Cloud Logs setup

@@ -71,7 +144,7 @@ Follow the steps below to create a custom dashboard in your IBM Cloud Logs instance

![New dashboard](./images/icl-dashboard-new.png)

* In the "Import" modal, select the file [./setup/dashboard-code_engine_resource_consumption_metrics.json](./setup/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import"
* In the "Import" modal, select the file [./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json](./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import"

![Import modal](./images/icl-dashboard-import.png)

@@ -117,8 +190,6 @@ app:"codeengine" AND message.metric:"instance-resources"

* In the top-right corner, select `1-line` as view mode

![View](./images/icl-logs-view-mode.png)

* In the graph title it says "**Count** all grouped by **Severity**". Click on `Severity` and select `message.component_name` instead. Furthermore, select `Max` as aggregation metric and choose `message.memory.usage` as aggregation field

![Graph](./images/icl-logs-view-graph.png)
@@ -132,54 +203,16 @@ app:"codeengine" AND message.metric:"instance-resources"
![Logs overview](./images/icl-logs-view-overview.png)


## IBM Log Analysis setup (deprecated)

### Log lines

Along with a human readable message, like `Captured metrics of app instance 'load-generator-00001-deployment-677d5b7754-ktcf6': 3m vCPU, 109 MB memory, 50 MB ephemeral storage`, each log line passes specific resource utilization details in a structured way allowing to apply advanced filters on them.

E.g.
- `cpu.usage:>80`: Filter for all log lines that noticed a CPU utilization of 80% or higher
- `memory.current:>1000`: Filter for all log lines that noticed an instance that used 1GB or higher of memory
- `component_type:app`: Filter only for app instances. Possible values are `app`, `job`, and `build`
- `component_name:<app-name>`: Filter for all instances of a specific app, job, or build
- `name:<instance-name>`: Filter for a specific instance

![IBM Cloud Logs](./images/ibm-cloud-logs--loglines.png)

### Log graphs

It is best to create an IBM Cloud Logs Board to visualize the CPU and memory usage per Code Engine component.

1. In your log instance navigate to Boards
1. Give it a proper name, enter `metric:instance-resources` as query and submit by clicking `Add Graph`
![New Board](./images/new-board.png)
1. Now the graph shows the overall amount of logs captured for the specified query per time interval
![Count of metrics log lines ](./images/count-of-metrics-lines.png)
1. Click on the filter icon above the graph and put in `metric:instance-resources AND component_name:<app-name>`
1. Switch the metric of the Graph to `Maximums`
1. Below the graph, add a new plot with `cpu.usage` as field and choose `ANY` as field values
![Configure Graph plots](./images/configure-plots.png)
1. Add another plot for the field `memory.usage` and values `ANY`
1. Finally, delete the plot `metrics:instance-resources` and adjust the plot colors to your liking
![Resource Usage graph](./images/resource-usage-graph.png)
1. The usage graph above renders the utilization in % of the CPU and Memory

#### Add CPU utilization
1. Duplicate the graph, change its name to CPU and replace its plots with `cpu.configured` and `cpu.current`.
- The resulting graph will render the actual CPU usage compared to the configured limit. The unit is milli vCPUs (1000 -> 1 vCPU).
![](./images/cpu-utilization.png)


#### Add memory utilization
1. Duplicate the graph, change its name to Memory and replace its plots with `memory.configured` and `memory.current`.
1. The resulting graph will render the actual memory usage compared to the configured limit. The unit is MB (1000 -> 1 GB).
![](./images/memory-utilization.png)

### Troubleshooting & Configuration

If the container fails to start with `METRICS_ENABLED=true`, check the logs for:
- Missing `/etc/secrets/monitoring-apikey` file
- Missing or wrong `METRICS_REMOTE_WRITE_FQDN` environment variable
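
A minimal sketch of what such a startup check could look like (this is an assumption for illustration, not the actual `start.sh`):

```shell
#!/bin/sh
# Hypothetical prerequisite check mirroring the two failure causes above.
# Arguments: $1 = path to the API-key file, $2 = remote-write FQDN.
check_metrics_prereqs() {
  apikey_file="$1"   # e.g. /etc/secrets/monitoring-apikey
  fqdn="$2"          # e.g. value of METRICS_REMOTE_WRITE_FQDN
  if [ ! -f "$apikey_file" ]; then
    echo "missing API key file: $apikey_file" >&2
    return 1
  fi
  if [ -z "$fqdn" ]; then
    echo "METRICS_REMOTE_WRITE_FQDN is not set" >&2
    return 1
  fi
  echo "metrics prerequisites ok"
}
```

Called as `check_metrics_prereqs /etc/secrets/monitoring-apikey "$METRICS_REMOTE_WRITE_FQDN"` before starting the collector, this fails fast instead of letting the container crash later.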

#### Add disk utilization
1. Duplicate the graph or create a new one, change its name to "Disk usage" and replace its plots with `disk_usage.current`.
1. The resulting graph will render the actual disk usage. While this does not allow identifying disk usage relative to the configured ephemeral storage limit, it gives an impression of whether disk usage grows over time. The unit is MB (1000 -> 1 GB).
![](./images/disk-utilization.png)
#### Environment Variables

- **`INTERVAL`** (default: `30`): Collection interval in seconds (minimum 30 seconds). Controls how frequently metrics are collected from the Kubernetes API endpoint in daemon mode.
- **`COLLECT_DISKUSAGE`** (default: `false`): Set to `true` to collect disk space usage. Note: The metrics collector calculates the overall file size stored in the pod's filesystem, which includes files from the container image, ephemeral storage, and mounted COS buckets. This metric cannot be used to calculate ephemeral storage usage alone.
- **`METRICS_ENABLED`** (default: `false`): Set to `true` to enable the HTTP metrics server. When disabled, the collector still runs and logs metrics to stdout but does not expose the HTTP endpoint.
- **`METRICS_REMOTE_WRITE_FQDN`**: IBM Cloud Monitoring ingestion endpoint FQDN (required when `METRICS_ENABLED=true`)
- **`METRICS_PORT`** (default: `9100`): HTTP server port for the Prometheus metrics endpoint. Only used when `METRICS_ENABLED=true` in daemon mode.