From 2be1a74135ab947f7f06e4328bbdb07328a68fe9 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Sat, 16 May 2026 21:15:09 -0700 Subject: [PATCH 01/14] feat(sdkstats): network statsbeat for A365 + OTLP exporters Ports microsoft/opentelemetry-distro-python#144 to TS. Records per-export success / failure / retry / throttle / exception counts and cumulative duration for the A365 and OTLP HTTP exporters, keyed by destination endpoint, and emits them via the existing standalone SDKStats pipeline. Architecture mirrors Python: single SdkStatsManager / MeterProvider / AzureMonitorStatsbeatExporter, single shared accumulator (networkStats.ts REQUESTS_MAP), every observation carries version = MICROSOFT_OPENTELEMETRY_VERSION. The recording exporter is distinguished on the wire only by the endpoint attribute. AzMon coexistence: when AzMon is enabled the standalone pipeline runs networkOnly=true so Feature / Feature.instrumentations gauges aren't double-emitted (AzMon long-interval owns them, with our distro feature bits bridged in via AZURE_MONITOR_STATSBEAT_FEATURES); the AzMon exporter's own network statsbeat continues running unchanged for AzMon-endpoint transmits. New files: - src/sdkstats/networkStats.ts (accumulator + record* helpers + THROTTLE_STATUS_CODES) - src/sdkstats/otlpWrapper.ts (NetworkStats{Span,Metric,Log}Exporter decorators) - NETWORK_SDKSTATS_PLAN.md (design doc) - 3 new test files (networkStats, otlpWrapper, agent365NetworkStats) Modified files: src/sdkstats/{metrics,manager,index}.ts (add networkOnly mode + 6 request_* gauges), src/otlp/handler.ts (wrap exporters when SDKStats enabled), src/a365/exporter/Agent365Exporter.ts (instrument postWithRetries), src/distro/distro.ts (always initialize with networkOnly=azureMonitorEnabled), plus 2 extended existing tests. Build clean; 844 unit tests pass (21 new). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/a365/exporter/Agent365Exporter.ts | 56 +++++ src/distro/distro.ts | 25 +- src/otlp/handler.ts | 31 ++- src/sdkstats/index.ts | 27 +++ src/sdkstats/manager.ts | 13 +- src/sdkstats/metrics.ts | 145 +++++++++++- src/sdkstats/networkStats.ts | 137 +++++++++++ src/sdkstats/otlpWrapper.ts | 202 ++++++++++++++++ .../unit/a365/agent365NetworkStats.test.ts | 177 ++++++++++++++ test/internal/unit/otlp/handler.test.ts | 28 +++ test/internal/unit/sdkstats/metrics.test.ts | 121 +++++++++- .../unit/sdkstats/networkStats.test.ts | 118 ++++++++++ .../unit/sdkstats/otlpWrapper.test.ts | 218 ++++++++++++++++++ 13 files changed, 1269 insertions(+), 29 deletions(-) create mode 100644 src/sdkstats/networkStats.ts create mode 100644 src/sdkstats/otlpWrapper.ts create mode 100644 test/internal/unit/a365/agent365NetworkStats.test.ts create mode 100644 test/internal/unit/sdkstats/networkStats.test.ts create mode 100644 test/internal/unit/sdkstats/otlpWrapper.test.ts diff --git a/src/a365/exporter/Agent365Exporter.ts b/src/a365/exporter/Agent365Exporter.ts index ce311ec..c958523 100644 --- a/src/a365/exporter/Agent365Exporter.ts +++ b/src/a365/exporter/Agent365Exporter.ts @@ -21,6 +21,16 @@ import { chunkBySize, } from "./utils.js"; import { getA365Logger } from "../logging.js"; +import { + THROTTLE_STATUS_CODES, + isSdkStatsEnabled, + recordDuration, + recordException, + recordFailure, + recordRetry, + recordSuccess, + recordThrottle, +} from "../../sdkstats/index.js"; const DEFAULT_MAX_RETRIES = 3; @@ -251,7 +261,21 @@ export class Agent365Exporter implements SpanExporter { ): Promise<{ ok: boolean; correlationId: string }> { let lastCorrelationId = "unknown"; + // Resolve the endpoint host (and the SDKStats kill-switch) once per + // call so each retry attempt records under the same key without + // re-parsing the URL or re-checking env on every iteration. 
+ const recordA365Stats = isSdkStatsEnabled(); + let endpointHost = url; + if (recordA365Stats) { + try { + endpointHost = new URL(url).hostname || url; + } catch { + endpointHost = url; + } + } + for (let attempt = 0; attempt <= DEFAULT_MAX_RETRIES; attempt++) { + const startTime = Date.now(); try { const response = await fetch(url, { method: "POST", @@ -260,6 +284,10 @@ export class Agent365Exporter implements SpanExporter { signal: AbortSignal.timeout(this.options.httpRequestTimeoutMilliseconds), }); + if (recordA365Stats) { + recordDuration(endpointHost, (Date.now() - startTime) / 1000); + } + const correlationId = response.headers.get("x-ms-correlation-id") ?? response.headers.get("x-correlation-id") ?? @@ -267,6 +295,9 @@ export class Agent365Exporter implements SpanExporter { lastCorrelationId = correlationId; if (response.status >= 200 && response.status < 300) { + if (recordA365Stats) { + recordSuccess(endpointHost); + } return { ok: true, correlationId }; } @@ -275,6 +306,11 @@ export class Agent365Exporter implements SpanExporter { [408, 429].includes(response.status) || (response.status >= 500 && response.status < 600) ) { + if (recordA365Stats) { + // 402 (throttle) is not in the retryable set, so it never + // lands here — only true retries. + recordRetry(endpointHost, response.status); + } if (attempt < DEFAULT_MAX_RETRIES) { const sleepMs = 200 * (attempt + 1) + Math.floor(Math.random() * 100); this.logger.warn( @@ -283,6 +319,17 @@ export class Agent365Exporter implements SpanExporter { await sleep(sleepMs); continue; } + // Retries exhausted: also record a final failure so dashboards + // see this as a terminal failure (not just a retry blip). 
+ if (recordA365Stats) { + recordFailure(endpointHost, response.status); + } + } else if (recordA365Stats) { + if (THROTTLE_STATUS_CODES.has(response.status)) { + recordThrottle(endpointHost, response.status); + } else { + recordFailure(endpointHost, response.status); + } } this.logger.error( @@ -290,6 +337,15 @@ export class Agent365Exporter implements SpanExporter { ); return { ok: false, correlationId }; } catch (error) { + if (recordA365Stats) { + recordDuration(endpointHost, (Date.now() - startTime) / 1000); + recordException( + endpointHost, + error instanceof Error + ? error.name || error.constructor.name || "Error" + : typeof error, + ); + } this.logger.error("[Agent365Exporter] Request error:", error); if (attempt < DEFAULT_MAX_RETRIES) { await sleep(200 * (attempt + 1)); diff --git a/src/distro/distro.ts b/src/distro/distro.ts index cc5aa92..ac3bde4 100644 --- a/src/distro/distro.ts +++ b/src/distro/distro.ts @@ -376,16 +376,21 @@ export function useMicrosoftOpenTelemetry(options?: MicrosoftOpenTelemetryOption isShutdown = false; sdk.start(); - // ── SDKStats: standalone pipeline for non-Azure-Monitor paths ───── - // When Azure Monitor is enabled the exporter package emits SDKStats - // itself (reading bits set above via `AZURE_MONITOR_STATSBEAT_FEATURES`). - // For A365-only / OTLP-only / Console-only customers we spin up our - // own MeterProvider + AzureMonitorStatsbeatExporter pipeline so the - // distro feature/instrumentation bits still reach the well-known - // statsbeat endpoint. - if (!azureMonitorEnabled) { - void SdkStatsManager.getInstance().initialize(); - } + // ── SDKStats: standalone pipeline ───────────────────────────────── + // The standalone pipeline ALWAYS runs so per-export network statsbeat + // (`request_*` gauges) for A365 / OTLP transmits is captured. + // + // - When Azure Monitor is enabled (`networkOnly: true`): only the + // network gauges are registered. 
The Feature / Feature.instrumentations + // long-interval statsbeat is owned by the AzMon exporter, with our + // distro bits bridged in via `setStatsbeatFeatures` → + // `AZURE_MONITOR_STATSBEAT_FEATURES`. Network statsbeat is safe to + // coexist because the `endpoint` attribute partitions the time series + // (AzMon ingestion hosts vs A365 / OTLP hosts). + // - When Azure Monitor is disabled: the standalone pipeline owns the + // full set (feature + instrumentation + network) and ships them to + // the well-known statsbeat endpoint. + void SdkStatsManager.getInstance().initialize({ networkOnly: azureMonitorEnabled }); // Initialize GenAI instrumentations after providers are registered so any // tracer they capture is backed by the active SDK provider. diff --git a/src/otlp/handler.ts b/src/otlp/handler.ts index 6645f62..f5ff166 100644 --- a/src/otlp/handler.ts +++ b/src/otlp/handler.ts @@ -5,12 +5,18 @@ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http"; import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-http"; import { OTLPLogExporter } from "@opentelemetry/exporter-logs-otlp-http"; import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base"; -import type { SpanProcessor } from "@opentelemetry/sdk-trace-base"; +import type { SpanExporter, SpanProcessor } from "@opentelemetry/sdk-trace-base"; import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics"; -import type { MetricReader } from "@opentelemetry/sdk-metrics"; +import type { MetricReader, PushMetricExporter } from "@opentelemetry/sdk-metrics"; import { BatchLogRecordProcessor } from "@opentelemetry/sdk-logs"; -import type { LogRecordProcessor } from "@opentelemetry/sdk-logs"; +import type { LogRecordExporter, LogRecordProcessor } from "@opentelemetry/sdk-logs"; import { Logger } from "../shared/logging/index.js"; +import { + NetworkStatsLogExporter, + NetworkStatsMetricExporter, + NetworkStatsSpanExporter, + isSdkStatsEnabled, +} from 
"../sdkstats/index.js"; const OTEL_EXPORTER_OTLP_ENDPOINT = "OTEL_EXPORTER_OTLP_ENDPOINT"; const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"; @@ -125,12 +131,19 @@ export function createOtlpComponents(): OtlpComponents { const components: OtlpComponents = {}; Logger.getInstance().info("OTLP export enabled for traces, metrics, and logs."); + // When SDKStats is enabled, decorate each OTLP exporter with the + // matching NetworkStats* wrapper so per-export success / failure / + // exception / duration counters reach the standalone SDKStats pipeline. + const recordNetworkStats = isSdkStatsEnabled(); // Trace exporter — reads OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT, // OTEL_EXPORTER_OTLP_HEADERS, OTEL_EXPORTER_OTLP_TRACES_HEADERS, // OTEL_EXPORTER_OTLP_TIMEOUT, OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, // OTEL_EXPORTER_OTLP_COMPRESSION, OTEL_EXPORTER_OTLP_TRACES_COMPRESSION - const traceExporter = new OTLPTraceExporter(); + let traceExporter: SpanExporter = new OTLPTraceExporter(); + if (recordNetworkStats) { + traceExporter = new NetworkStatsSpanExporter(traceExporter); + } components.spanProcessor = new BatchSpanProcessor(traceExporter); // Metric exporter — reads OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, @@ -139,7 +152,10 @@ export function createOtlpComponents(): OtlpComponents { // OTEL_EXPORTER_OTLP_COMPRESSION, OTEL_EXPORTER_OTLP_METRICS_COMPRESSION, // OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE, // OTEL_EXPORTER_OTLP_METRICS_DEFAULT_HISTOGRAM_AGGREGATION - const metricExporter = new OTLPMetricExporter(); + let metricExporter: PushMetricExporter = new OTLPMetricExporter(); + if (recordNetworkStats) { + metricExporter = new NetworkStatsMetricExporter(metricExporter); + } components.metricReader = new PeriodicExportingMetricReader({ exporter: metricExporter, }); @@ -148,7 +164,10 @@ export function createOtlpComponents(): OtlpComponents { // OTEL_EXPORTER_OTLP_HEADERS, 
OTEL_EXPORTER_OTLP_LOGS_HEADERS, // OTEL_EXPORTER_OTLP_TIMEOUT, OTEL_EXPORTER_OTLP_LOGS_TIMEOUT, // OTEL_EXPORTER_OTLP_COMPRESSION, OTEL_EXPORTER_OTLP_LOGS_COMPRESSION - const logExporter = new OTLPLogExporter(); + let logExporter: LogRecordExporter = new OTLPLogExporter(); + if (recordNetworkStats) { + logExporter = new NetworkStatsLogExporter(logExporter); + } components.logRecordProcessor = new BatchLogRecordProcessor(logExporter); return components; diff --git a/src/sdkstats/index.ts b/src/sdkstats/index.ts index 674481e..a1befbe 100644 --- a/src/sdkstats/index.ts +++ b/src/sdkstats/index.ts @@ -30,5 +30,32 @@ export { } from "./state.js"; export { SdkStatsMetrics, FEATURE_TYPE_FEATURE, FEATURE_TYPE_INSTRUMENTATION } from "./metrics.js"; +export type { SdkStatsMetricsOptions } from "./metrics.js"; export { SdkStatsManager } from "./manager.js"; + +export { + THROTTLE_STATUS_CODES, + REQUEST_SUCCESS_NAME, + REQUEST_FAILURE_NAME, + REQUEST_RETRY_NAME, + REQUEST_THROTTLE_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_DURATION_NAME, + NETWORK_METRIC_NAMES, + recordSuccess, + recordFailure, + recordRetry, + recordThrottle, + recordException, + recordDuration, + drain, + _resetAllForTest as _resetNetworkStatsForTest, +} from "./networkStats.js"; +export type { NetworkMetricName, NetworkKey } from "./networkStats.js"; + +export { + NetworkStatsSpanExporter, + NetworkStatsMetricExporter, + NetworkStatsLogExporter, +} from "./otlpWrapper.js"; diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index b0417d2..e44a5da 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -83,11 +83,18 @@ export class SdkStatsManager { /** * Set up SDKStats export via the Azure Monitor statsbeat endpoint. * + * @param options.networkOnly When `true`, the {@link SdkStatsMetrics} + * instance only registers the six network gauges and skips the + * feature/instrumentation gauges. 
Used on the Azure-Monitor-enabled + * path because the AzMon exporter's own long-interval statsbeat + * already emits those gauges (with our distro bits bridged in via + * `AZURE_MONITOR_STATSBEAT_FEATURES`). + * * Returns `true` if the standalone pipeline was initialized (or was * already initialized), `false` if SDKStats are disabled via env var * or initialization failed. */ - async initialize(): Promise { + async initialize(options: { networkOnly?: boolean } = {}): Promise { if (!isSdkStatsEnabled()) { return false; } @@ -134,7 +141,9 @@ export class SdkStatsManager { readers: [reader], resource: resourceFromAttributes({}), }); - this._metrics = new SdkStatsMetrics(this._meterProvider); + this._metrics = new SdkStatsMetrics(this._meterProvider, { + networkOnly: options.networkOnly, + }); this._initialized = true; setSdkStatsShutdown(false); diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 909ac9f..537795a 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -18,6 +18,17 @@ import type { ObservableResult } from "@opentelemetry/api"; import { MICROSOFT_OPENTELEMETRY_VERSION } from "../types.js"; import { getSdkStatsFeatureFlags, getSdkStatsInstrumentationFlags } from "./state.js"; +import { + NETWORK_METRIC_NAMES, + REQUEST_DURATION_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_FAILURE_NAME, + REQUEST_RETRY_NAME, + REQUEST_SUCCESS_NAME, + REQUEST_THROTTLE_NAME, + drain, + type NetworkMetricName, +} from "./networkStats.js"; /** * Feature SDKStats `type` dimension values, per the Application Insights @@ -32,14 +43,95 @@ const FEATURE_METRIC_NAME = "Feature"; const INSTRUMENTATION_METRIC_NAME = "Feature.instrumentations"; const STATSBEAT_LANGUAGE = "node"; +/** + * Per-metric configuration for the six network statsbeat gauges. + * + * - `secondAttr` — name of the additional dimension (`statusCode` or + * `exceptionType`) reported alongside `endpoint`. `undefined` means the + * metric is keyed on `endpoint` only. 
+ */ +interface NetworkGaugeSpec { + metric: NetworkMetricName; + secondAttr?: "statusCode" | "exceptionType"; + unit: string; + description: string; +} + +const NETWORK_GAUGE_SPECS: readonly NetworkGaugeSpec[] = [ + { + metric: REQUEST_SUCCESS_NAME, + unit: "count", + description: "Number of successful HTTP exports per endpoint", + }, + { + metric: REQUEST_FAILURE_NAME, + secondAttr: "statusCode", + unit: "count", + description: "Number of failed HTTP exports per endpoint and status code", + }, + { + metric: REQUEST_RETRY_NAME, + secondAttr: "statusCode", + unit: "count", + description: "Number of retried HTTP exports per endpoint and status code", + }, + { + metric: REQUEST_THROTTLE_NAME, + secondAttr: "statusCode", + unit: "count", + description: "Number of throttled HTTP exports per endpoint and status code", + }, + { + metric: REQUEST_EXCEPTION_NAME, + secondAttr: "exceptionType", + unit: "count", + description: "Number of HTTP exports that raised an exception, per endpoint and exception type", + }, + { + metric: REQUEST_DURATION_NAME, + unit: "s", + description: "Cumulative HTTP export duration per endpoint", + }, +]; + +// Sanity check at module load — keeps NETWORK_GAUGE_SPECS in sync with +// NETWORK_METRIC_NAMES if either is edited. +/* istanbul ignore next */ +if (NETWORK_GAUGE_SPECS.length !== NETWORK_METRIC_NAMES.length) { + throw new Error("NETWORK_GAUGE_SPECS is out of sync with NETWORK_METRIC_NAMES"); +} + +/** + * Options for {@link SdkStatsMetrics}. + */ +export interface SdkStatsMetricsOptions { + /** Override the distro version reported on every observation. */ + distroVersion?: string; + /** + * When `true`, skip the Feature / Feature.instrumentations gauges. Used + * on the Azure-Monitor-enabled path because the AzMon exporter's own + * long-interval statsbeat already emits those gauges (with our distro + * bits bridged in via `AZURE_MONITOR_STATSBEAT_FEATURES`); registering + * them here would double-count. 
+ * + * The six network statsbeat gauges (`request_*`) are always registered + * regardless of this flag — coexistence with AzMon's own network + * statsbeat is safe because the `endpoint` attribute partitions the + * series by destination host. + */ + networkOnly?: boolean; +} + /** * Registers observable gauges that emit feature/instrumentation data - * derived from the global SDKStats state. + * derived from the global SDKStats state, plus per-export network + * statsbeat counters drained from {@link ./networkStats.js}. */ export class SdkStatsMetrics { private readonly commonAttributes: Record; - constructor(meterProvider: MeterProvider, distroVersion?: string) { + constructor(meterProvider: MeterProvider, options: SdkStatsMetricsOptions = {}) { + const { distroVersion, networkOnly = false } = options; const meter = meterProvider.getMeter("microsoft.opentelemetry.sdkstats"); this.commonAttributes = { @@ -49,15 +141,32 @@ export class SdkStatsMetrics { version: distroVersion || MICROSOFT_OPENTELEMETRY_VERSION, }; - const featureGauge = meter.createObservableGauge(FEATURE_METRIC_NAME, { - description: "SDKStats metric tracking enabled features", - }); - featureGauge.addCallback(this.observeFeatures); + // Feature / instrumentation bitmask gauges are skipped when running + // alongside the Azure Monitor exporter's own statsbeat — that pipeline + // already emits them (with our distro bits bridged in via + // `AZURE_MONITOR_STATSBEAT_FEATURES`) and would collide with these. 
+ if (!networkOnly) { + const featureGauge = meter.createObservableGauge(FEATURE_METRIC_NAME, { + description: "SDKStats metric tracking enabled features", + }); + featureGauge.addCallback(this.observeFeatures); - const instrumentationGauge = meter.createObservableGauge(INSTRUMENTATION_METRIC_NAME, { - description: "SDKStats metric tracking enabled instrumentations", - }); - instrumentationGauge.addCallback(this.observeInstrumentations); + const instrumentationGauge = meter.createObservableGauge(INSTRUMENTATION_METRIC_NAME, { + description: "SDKStats metric tracking enabled instrumentations", + }); + instrumentationGauge.addCallback(this.observeInstrumentations); + } + + // Network statsbeat gauges — always registered. Each callback drains + // the counts accumulated by exporters between observations and emits + // one Observation per (endpoint[, second-attr]) tuple. + for (const spec of NETWORK_GAUGE_SPECS) { + const gauge = meter.createObservableGauge(spec.metric, { + unit: spec.unit, + description: spec.description, + }); + gauge.addCallback(this.makeNetworkCallback(spec)); + } } private observeFeatures = (result: ObservableResult): void => { @@ -87,4 +196,20 @@ export class SdkStatsMetrics { type: FEATURE_TYPE_INSTRUMENTATION, }); }; + + private makeNetworkCallback(spec: NetworkGaugeSpec): (result: ObservableResult) => void { + return (result: ObservableResult): void => { + for (const [key, value] of drain(spec.metric)) { + const attrs: Record = { + ...this.commonAttributes, + endpoint: key[0], + }; + if (spec.secondAttr && key.length === 2) { + attrs[spec.secondAttr] = key[1]; + } + result.observe(value, attrs); + } + }; + } } + diff --git a/src/sdkstats/networkStats.ts b/src/sdkstats/networkStats.ts new file mode 100644 index 0000000..28101ef --- /dev/null +++ b/src/sdkstats/networkStats.ts @@ -0,0 +1,137 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +/** + * Network statsbeat accumulator for SDK self-telemetry. 
+ * + * Per-export success / failure / retry / throttle / exception counts and + * cumulative request duration for telemetry exporters. Exporters call the + * `record*` functions after each transmit; the {@link SdkStatsMetrics} + * observable-gauge callbacks drain the accumulated counts on each export + * interval. + * + * Mirrors `src/microsoft/opentelemetry/_sdkstats/_utils.py` from the Python + * distro (microsoft/opentelemetry-distro-python#144). + */ + +/** + * HTTP status codes treated as throttling for SDKStats purposes. + * + * @internal + */ +export const THROTTLE_STATUS_CODES: ReadonlySet = new Set([402]); + +export const REQUEST_SUCCESS_NAME = "request_success_count"; +export const REQUEST_FAILURE_NAME = "request_failure_count"; +export const REQUEST_RETRY_NAME = "request_retry_count"; +export const REQUEST_THROTTLE_NAME = "request_throttle_count"; +export const REQUEST_EXCEPTION_NAME = "request_exception_count"; +export const REQUEST_DURATION_NAME = "request_duration"; + +/** + * Names of all six network statsbeat metrics, in registration order. + * + * @internal + */ +export const NETWORK_METRIC_NAMES = [ + REQUEST_SUCCESS_NAME, + REQUEST_FAILURE_NAME, + REQUEST_RETRY_NAME, + REQUEST_THROTTLE_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_DURATION_NAME, +] as const; + +export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number]; + +/** + * Composite key for an aggregated network statsbeat counter. + * + * - Single-element tuples key on `endpoint` only (success / duration). + * - Two-element tuples key on `[endpoint, statusCode | exceptionType]` + * (failure / retry / throttle / exception). + * + * @internal + */ +export type NetworkKey = readonly [string] | readonly [string, string]; + +// Single-threaded JS execution → no lock needed (Python uses one because of +// the GIL + threads; Node.js doesn't share JS objects across worker threads). 
+const REQUESTS_MAP: Record> = { + [REQUEST_SUCCESS_NAME]: new Map(), + [REQUEST_FAILURE_NAME]: new Map(), + [REQUEST_RETRY_NAME]: new Map(), + [REQUEST_THROTTLE_NAME]: new Map(), + [REQUEST_EXCEPTION_NAME]: new Map(), + [REQUEST_DURATION_NAME]: new Map(), +}; + +// `Map` keys are compared by identity for arrays/objects, so we serialize +// the key tuple to a string. The `\u0000` separator can't appear in a URL +// hostname or HTTP status string, so this is unambiguous. +const KEY_SEPARATOR = "\u0000"; + +function encodeKey(key: NetworkKey): string { + return key.length === 1 ? key[0] : `${key[0]}${KEY_SEPARATOR}${key[1]}`; +} + +function decodeKey(encoded: string): NetworkKey { + const sep = encoded.indexOf(KEY_SEPARATOR); + if (sep < 0) return [encoded] as const; + return [encoded.slice(0, sep), encoded.slice(sep + 1)] as const; +} + +function bump(metric: NetworkMetricName, key: NetworkKey, value = 1): void { + const bucket = REQUESTS_MAP[metric]; + const encoded = encodeKey(key); + bucket.set(encoded, (bucket.get(encoded) ?? 0) + value); +} + +export function recordSuccess(endpoint: string): void { + bump(REQUEST_SUCCESS_NAME, [endpoint]); +} + +export function recordFailure(endpoint: string, statusCode: number | string): void { + bump(REQUEST_FAILURE_NAME, [endpoint, String(statusCode)]); +} + +export function recordRetry(endpoint: string, statusCode: number | string): void { + bump(REQUEST_RETRY_NAME, [endpoint, String(statusCode)]); +} + +export function recordThrottle(endpoint: string, statusCode: number | string = 402): void { + bump(REQUEST_THROTTLE_NAME, [endpoint, String(statusCode)]); +} + +export function recordException(endpoint: string, exceptionType: string): void { + bump(REQUEST_EXCEPTION_NAME, [endpoint, exceptionType]); +} + +export function recordDuration(endpoint: string, durationSeconds: number): void { + bump(REQUEST_DURATION_NAME, [endpoint], durationSeconds); +} + +/** + * Atomically return and reset the counts for `metric`. 
+ * + * Used by the observable-gauge callbacks so each observation reports only + * the delta accumulated during the export interval. + */ +export function drain(metric: NetworkMetricName): Map { + const bucket = REQUESTS_MAP[metric]; + const snapshot = new Map(); + for (const [encoded, value] of bucket) { + snapshot.set(decodeKey(encoded), value); + } + bucket.clear(); + return snapshot; +} + +/** + * @internal Test-only: clear all network statsbeat counters. + */ +export function _resetAllForTest(): void { + for (const name of NETWORK_METRIC_NAMES) { + REQUESTS_MAP[name].clear(); + } +} diff --git a/src/sdkstats/otlpWrapper.ts b/src/sdkstats/otlpWrapper.ts new file mode 100644 index 0000000..1980d51 --- /dev/null +++ b/src/sdkstats/otlpWrapper.ts @@ -0,0 +1,202 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +/** + * Network statsbeat wrappers for OTLP exporters. + * + * The upstream OTLP HTTP exporters do not surface HTTP status codes — only + * the {@link ExportResult} enum and any raised exception. The decorators + * here capture that signal so the network statsbeat pipeline can record + * success / failure / exception / duration counts per endpoint. + * + * Mirrors `src/microsoft/opentelemetry/_sdkstats/_otlp_wrapper.py` from the + * Python distro (microsoft/opentelemetry-distro-python#144). + */ + +import type { ExportResult } from "@opentelemetry/core"; +import { ExportResultCode } from "@opentelemetry/core"; +import type { + AggregationTemporality, + AggregationOption, + InstrumentType, + PushMetricExporter, + ResourceMetrics, +} from "@opentelemetry/sdk-metrics"; +import type { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base"; +import type { LogRecordExporter, ReadableLogRecord } from "@opentelemetry/sdk-logs"; + +import { + recordDuration, + recordException, + recordFailure, + recordSuccess, +} from "./networkStats.js"; + +/** + * Resolve the destination hostname for a given OTLP signal. 
+ * + * The OTel HTTP exporters do not expose their endpoint on a stable public + * field, so we read the same env-var precedence the exporters themselves + * use ({@link https://opentelemetry.io/docs/specs/otel/protocol/exporter/}). + * Falls back to `"unknown"` when no endpoint can be resolved (e.g. fully + * programmatic config without env vars). + */ +function resolveEndpointHost(signal: "traces" | "metrics" | "logs"): string { + const signalSpecific = + signal === "traces" + ? "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" + : signal === "metrics" + ? "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT" + : "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT"; + + const raw = process.env[signalSpecific] ?? process.env.OTEL_EXPORTER_OTLP_ENDPOINT; + if (!raw) return "unknown"; + + try { + return new URL(raw).hostname || raw; + } catch { + return raw; + } +} + +/** + * Common bookkeeping for an export attempt. + * + * The OTel JS exporter contract is callback-based, not promise-based, and + * the HTTP exporters surface no status code — only an {@link ExportResult}. + * On `ExportResultCode.SUCCESS` we record a success; otherwise we record + * failure with a placeholder `statusCode=0` (matching the Python distro). + * Only synchronous throws are recorded as exceptions (keyed by the error + * class name); errors reported through the callback count as failures. + */ +function wrapExport( + endpoint: string, + inner: (resultCallback: (result: ExportResult) => void) => void, + resultCallback: (result: ExportResult) => void, + _items: T, +): void { + const start = Date.now(); + let settled = false; + const settle = (result: ExportResult): void => { + if (settled) return; + settled = true; + recordDuration(endpoint, (Date.now() - start) / 1000); + if (result.code === ExportResultCode.SUCCESS) { + recordSuccess(endpoint); + } else { + // The HTTP exporters don't expose an HTTP status code, so record + // failures with statusCode=0 (matches Python distro). 
+ recordFailure(endpoint, 0); + } + resultCallback(result); + }; + + try { + inner(settle); + } catch (err) { + settled = true; + recordDuration(endpoint, (Date.now() - start) / 1000); + recordException(endpoint, errorName(err)); + throw err; + } +} + +function errorName(err: unknown): string { + if (err instanceof Error) { + return err.name || err.constructor.name || "Error"; + } + return typeof err; +} + +/** + * Span exporter decorator that records network statsbeat counts. + */ +export class NetworkStatsSpanExporter implements SpanExporter { + private readonly endpoint: string; + + constructor(private readonly inner: SpanExporter) { + this.endpoint = resolveEndpointHost("traces"); + } + + export(spans: ReadableSpan[], resultCallback: (result: ExportResult) => void): void { + wrapExport( + this.endpoint, + (cb) => this.inner.export(spans, cb), + resultCallback, + spans, + ); + } + + shutdown(): Promise { + return this.inner.shutdown(); + } + + forceFlush(): Promise { + return this.inner.forceFlush?.() ?? Promise.resolve(); + } +} + +/** + * Metric exporter decorator that records network statsbeat counts. + * + * `selectAggregationTemporality` / `selectAggregation` are forwarded only + * when the inner exporter defines them — preserving its preferences while + * keeping our wrapper transparent to the SDK's default-aggregation logic + * for exporters that don't. 
+ */ +export class NetworkStatsMetricExporter implements PushMetricExporter { + private readonly endpoint: string; + selectAggregationTemporality?: (instrumentType: InstrumentType) => AggregationTemporality; + selectAggregation?: (instrumentType: InstrumentType) => AggregationOption; + + constructor(private readonly inner: PushMetricExporter) { + this.endpoint = resolveEndpointHost("metrics"); + if (inner.selectAggregationTemporality) { + this.selectAggregationTemporality = (t) => inner.selectAggregationTemporality!(t); + } + if (inner.selectAggregation) { + this.selectAggregation = (t) => inner.selectAggregation!(t); + } + } + + export(metrics: ResourceMetrics, resultCallback: (result: ExportResult) => void): void { + wrapExport( + this.endpoint, + (cb) => this.inner.export(metrics, cb), + resultCallback, + metrics, + ); + } + + forceFlush(): Promise { + return this.inner.forceFlush(); + } + + shutdown(): Promise { + return this.inner.shutdown(); + } +} + +/** + * Log exporter decorator that records network statsbeat counts. + */ +export class NetworkStatsLogExporter implements LogRecordExporter { + private readonly endpoint: string; + + constructor(private readonly inner: LogRecordExporter) { + this.endpoint = resolveEndpointHost("logs"); + } + + export(logs: ReadableLogRecord[], resultCallback: (result: ExportResult) => void): void { + wrapExport( + this.endpoint, + (cb) => this.inner.export(logs, cb), + resultCallback, + logs, + ); + } + + shutdown(): Promise { + return this.inner.shutdown(); + } +} diff --git a/test/internal/unit/a365/agent365NetworkStats.test.ts b/test/internal/unit/a365/agent365NetworkStats.test.ts new file mode 100644 index 0000000..4f31e1b --- /dev/null +++ b/test/internal/unit/a365/agent365NetworkStats.test.ts @@ -0,0 +1,177 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { SpanKind, SpanStatusCode, TraceFlags } from "@opentelemetry/api"; +import type { ReadableSpan } from "@opentelemetry/sdk-trace-base"; + +import { Agent365Exporter } from "../../../../src/a365/exporter/Agent365Exporter.js"; +import { + REQUEST_DURATION_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_FAILURE_NAME, + REQUEST_RETRY_NAME, + REQUEST_SUCCESS_NAME, + REQUEST_THROTTLE_NAME, + _resetAllForTest, + drain, +} from "../../../../src/sdkstats/networkStats.js"; +import { _resetA365LoggerForTest } from "../../../../src/a365/logging.js"; + +const TENANT_ID = "tenant-11111111-1111-1111-1111-111111111111"; +const AGENT_ID = "agent-22222222-2222-2222-2222-222222222222"; + +function makeSpan(): ReadableSpan { + return { + name: "test-span", + kind: SpanKind.INTERNAL, + spanContext: () => ({ + traceId: "aaaabbbbccccddddeeee111122223333", + spanId: "1111222233334444", + traceFlags: TraceFlags.SAMPLED, + }), + parentSpanContext: undefined, + startTime: [1700000000, 0], + endTime: [1700000001, 0], + status: { code: SpanStatusCode.OK }, + attributes: { + "microsoft.tenant.id": TENANT_ID, + "gen_ai.agent.id": AGENT_ID, + "gen_ai.operation.name": "invoke_agent", + }, + events: [], + links: [], + resource: { attributes: {} }, + instrumentationScope: { name: "test-scope", version: "1.0.0" }, + instrumentationLibrary: { name: "test-scope", version: "1.0.0" }, + duration: [1, 0], + ended: true, + droppedAttributesCount: 0, + droppedEventsCount: 0, + droppedLinksCount: 0, + } as unknown as ReadableSpan; +} + +function exportSpan(exporter: Agent365Exporter): Promise { + return new Promise((resolve) => exporter.export([makeSpan()], (r) => resolve(r.code))); +} + +function fetchHost(): string { + // Whatever URL `Agent365Exporter` POSTs to in the default config — we + // pluck it from the captured fetch args so the test never has to know + // the Agent365 endpoint resolution rules. 
+ const calls = (globalThis.fetch as unknown as { mock?: { calls: unknown[][] } }).mock?.calls ?? []; + if (calls.length === 0) return "unknown"; + const url = calls[0][0] as string; + try { + return new URL(url).hostname; + } catch { + return url; + } +} + +describe("Agent365Exporter network statsbeat", () => { + let fetchSpy: ReturnType; + + beforeEach(() => { + delete process.env.MICROSOFT_OTEL_SDKSTATS_DISABLED; + delete process.env.APPLICATIONINSIGHTS_STATSBEAT_DISABLED_ALL; + _resetAllForTest(); + fetchSpy = vi.fn(); + vi.stubGlobal("fetch", fetchSpy); + }); + + afterEach(() => { + _resetAllForTest(); + _resetA365LoggerForTest(); + vi.restoreAllMocks(); + }); + + it("records request_success_count + request_duration on a 2xx response", async () => { + fetchSpy.mockResolvedValue({ + status: 200, + headers: new Map([["x-ms-correlation-id", "c1"]]), + }); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const host = fetchHost(); + expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[host], 1]]); + const dur = drain(REQUEST_DURATION_NAME); + expect([...dur.keys()][0]).toEqual([host]); + expect((dur.get([host]) ?? [...dur.values()][0])).toBeGreaterThanOrEqual(0); + }); + + it("records request_retry_count for every retryable response and a final request_failure_count when retries are exhausted", async () => { + fetchSpy.mockResolvedValue({ status: 503, headers: new Map() }); + // Speed up retries — postWithRetries does 1 initial + 3 retries = 4 attempts. 
+ vi.spyOn(globalThis, "setTimeout").mockImplementation(((cb: () => void) => { + cb(); + return 0 as unknown as NodeJS.Timeout; + }) as typeof setTimeout); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const host = fetchHost(); + const retries = drain(REQUEST_RETRY_NAME); + expect([...retries.entries()]).toEqual([[[host, "503"], 4]]); + const failures = drain(REQUEST_FAILURE_NAME); + expect([...failures.entries()]).toEqual([[[host, "503"], 1]]); + }); + + it("records request_failure_count for non-retryable, non-throttle status codes", async () => { + fetchSpy.mockResolvedValue({ status: 404, headers: new Map() }); + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const host = fetchHost(); + expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[host, "404"], 1]]); + expect(drain(REQUEST_RETRY_NAME).size).toBe(0); + }); + + it("records request_throttle_count on HTTP 402", async () => { + fetchSpy.mockResolvedValue({ status: 402, headers: new Map() }); + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const host = fetchHost(); + expect([...drain(REQUEST_THROTTLE_NAME).entries()]).toEqual([[[host, "402"], 1]]); + expect(drain(REQUEST_FAILURE_NAME).size).toBe(0); + }); + + it("records request_exception_count + duration when fetch rejects, on every retry", async () => { + class AbortError extends Error { + override name = "AbortError"; + } + fetchSpy.mockRejectedValue(new AbortError("aborted")); + vi.spyOn(globalThis, "setTimeout").mockImplementation(((cb: () => void) => { + cb(); + return 0 as unknown as NodeJS.Timeout; + }) as typeof setTimeout); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + const host = fetchHost(); + const exceptions = drain(REQUEST_EXCEPTION_NAME); + expect([...exceptions.entries()]).toEqual([[[host, 
"AbortError"], 4]]); + const durations = drain(REQUEST_DURATION_NAME); + expect([...durations.keys()][0]).toEqual([host]); + }); + + it("records nothing when MICROSOFT_OTEL_SDKSTATS_DISABLED=true", async () => { + process.env.MICROSOFT_OTEL_SDKSTATS_DISABLED = "true"; + fetchSpy.mockResolvedValue({ + status: 200, + headers: new Map(), + }); + + const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); + await exportSpan(exporter); + + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); + expect(drain(REQUEST_DURATION_NAME).size).toBe(0); + }); +}); diff --git a/test/internal/unit/otlp/handler.test.ts b/test/internal/unit/otlp/handler.test.ts index 42d88fb..30a8946 100644 --- a/test/internal/unit/otlp/handler.test.ts +++ b/test/internal/unit/otlp/handler.test.ts @@ -68,5 +68,33 @@ describe("OTLP Handler", () => { expect(components.metricReader).toBeInstanceOf(PeriodicExportingMetricReader); expect(components.logRecordProcessor).toBeInstanceOf(BatchLogRecordProcessor); }); + + describe("network SDKStats wiring", () => { + it("wraps each exporter with the NetworkStats* decorator when SDKStats is enabled", () => { + delete process.env["MICROSOFT_OTEL_SDKSTATS_DISABLED"]; + delete process.env["APPLICATIONINSIGHTS_STATSBEAT_DISABLED_ALL"]; + const components = createOtlpComponents(); + + const spanInner = (components.spanProcessor as unknown as { _exporter: unknown })._exporter; + const metricInner = ( + components.metricReader as unknown as { _exporter: unknown } + )._exporter; + const logInner = ( + components.logRecordProcessor as unknown as { _exporter: unknown } + )._exporter; + + expect(spanInner?.constructor.name).toBe("NetworkStatsSpanExporter"); + expect(metricInner?.constructor.name).toBe("NetworkStatsMetricExporter"); + expect(logInner?.constructor.name).toBe("NetworkStatsLogExporter"); + }); + + it("does NOT wrap exporters when MICROSOFT_OTEL_SDKSTATS_DISABLED=true", () => { + process.env["MICROSOFT_OTEL_SDKSTATS_DISABLED"] = "true"; + const 
components = createOtlpComponents(); + + const spanInner = (components.spanProcessor as unknown as { _exporter: unknown })._exporter; + expect(spanInner?.constructor.name).not.toBe("NetworkStatsSpanExporter"); + }); + }); }); }); diff --git a/test/internal/unit/sdkstats/metrics.test.ts b/test/internal/unit/sdkstats/metrics.test.ts index b2adf3c..663b865 100644 --- a/test/internal/unit/sdkstats/metrics.test.ts +++ b/test/internal/unit/sdkstats/metrics.test.ts @@ -4,6 +4,21 @@ import { describe, it, beforeEach, expect } from "vitest"; import { MeterProvider } from "@opentelemetry/sdk-metrics"; +import { + REQUEST_DURATION_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_FAILURE_NAME, + REQUEST_RETRY_NAME, + REQUEST_SUCCESS_NAME, + REQUEST_THROTTLE_NAME, + _resetAllForTest as _resetNetworkStatsForTest, + recordDuration, + recordException, + recordFailure, + recordRetry, + recordSuccess, + recordThrottle, +} from "../../../../src/sdkstats/networkStats.js"; import { FEATURE_TYPE_FEATURE, FEATURE_TYPE_INSTRUMENTATION, @@ -140,7 +155,7 @@ describe("sdkstats/metrics", () => { exportIntervalMillis: 60_000, }); const meterProvider = new MeterProvider({ readers: [reader] }); - new SdkStatsMetrics(meterProvider, "9.9.9-test"); + new SdkStatsMetrics(meterProvider, { distroVersion: "9.9.9-test" }); await meterProvider.forceFlush(); @@ -152,4 +167,108 @@ describe("sdkstats/metrics", () => { await meterProvider.shutdown(); }); + + describe("networkOnly mode", () => { + it("skips Feature/Feature.instrumentations gauges but still registers network gauges", async () => { + // Set bits that would normally trigger feature/instrumentation observations. + setSdkStatsFeature(StatsbeatFeature.DISTRO); + setSdkStatsInstrumentation(StatsbeatInstrumentation.MONGODB); + // Drop a network counter so a request_success_count observation will fire. 
+ _resetNetworkStatsForTest(); + recordSuccess("contoso.example.com"); + + const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics"); + const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); + const reader = new PeriodicExportingMetricReader({ + exporter, + exportIntervalMillis: 60_000, + }); + const meterProvider = new MeterProvider({ readers: [reader] }); + new SdkStatsMetrics(meterProvider, { networkOnly: true }); + + await meterProvider.forceFlush(); + + const names = exporter + .getMetrics() + .flatMap((rm) => rm.scopeMetrics.flatMap((sm) => sm.metrics)) + .map((m) => m.descriptor.name); + + expect(names).not.toContain("Feature"); + expect(names).not.toContain("Feature.instrumentations"); + expect(names).toContain(REQUEST_SUCCESS_NAME); + + await meterProvider.shutdown(); + _resetNetworkStatsForTest(); + }); + }); + + describe("network gauges (default mode)", () => { + it("emits one observation per drained key, attaches endpoint + statusCode/exceptionType, and clears after collection", async () => { + _resetNetworkStatsForTest(); + recordSuccess("a365.example.com"); + recordSuccess("a365.example.com"); + recordFailure("a365.example.com", 503); + recordRetry("a365.example.com", 503); + recordRetry("a365.example.com", 503); + recordThrottle("otlp.example.com", 402); + recordException("otlp.example.com", "AbortError"); + recordDuration("a365.example.com", 1.25); + + const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics"); + const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); + const reader = new PeriodicExportingMetricReader({ + exporter, + exportIntervalMillis: 60_000, + }); + const meterProvider = new MeterProvider({ readers: [reader] }); + new SdkStatsMetrics(meterProvider); + + await meterProvider.forceFlush(); + + const byName = (name: string) => + exporter + .getMetrics() + .flatMap((rm) => rm.scopeMetrics.flatMap((sm) => sm.metrics)) + .filter((m) 
=> m.descriptor.name === name) + .flatMap((m) => m.dataPoints); + + const success = byName(REQUEST_SUCCESS_NAME); + expect(success).toHaveLength(1); + expect(success[0].value).toBe(2); + expect(success[0].attributes.endpoint).toBe("a365.example.com"); + expect(success[0].attributes.statusCode).toBeUndefined(); + + const failure = byName(REQUEST_FAILURE_NAME); + expect(failure).toHaveLength(1); + expect(failure[0].value).toBe(1); + expect(failure[0].attributes.endpoint).toBe("a365.example.com"); + expect(failure[0].attributes.statusCode).toBe("503"); + + const retry = byName(REQUEST_RETRY_NAME); + expect(retry).toHaveLength(1); + expect(retry[0].value).toBe(2); + expect(retry[0].attributes.statusCode).toBe("503"); + + const throttle = byName(REQUEST_THROTTLE_NAME); + expect(throttle).toHaveLength(1); + expect(throttle[0].attributes.endpoint).toBe("otlp.example.com"); + expect(throttle[0].attributes.statusCode).toBe("402"); + + const exception = byName(REQUEST_EXCEPTION_NAME); + expect(exception).toHaveLength(1); + expect(exception[0].attributes.exceptionType).toBe("AbortError"); + + const duration = byName(REQUEST_DURATION_NAME); + expect(duration).toHaveLength(1); + expect(duration[0].value).toBeCloseTo(1.25); + + // Second flush after a reset & drain: drain semantics are covered + // in networkStats.test.ts; observable gauges may legitimately repeat + // their last value under CUMULATIVE aggregation depending on the + // SDK's caching, so we don't assert "empty" here. + + await meterProvider.shutdown(); + _resetNetworkStatsForTest(); + }); + }); }); diff --git a/test/internal/unit/sdkstats/networkStats.test.ts b/test/internal/unit/sdkstats/networkStats.test.ts new file mode 100644 index 0000000..cf10831 --- /dev/null +++ b/test/internal/unit/sdkstats/networkStats.test.ts @@ -0,0 +1,118 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +import { beforeEach, describe, expect, it } from "vitest"; + +import { + NETWORK_METRIC_NAMES, + REQUEST_DURATION_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_FAILURE_NAME, + REQUEST_RETRY_NAME, + REQUEST_SUCCESS_NAME, + REQUEST_THROTTLE_NAME, + THROTTLE_STATUS_CODES, + _resetAllForTest, + drain, + recordDuration, + recordException, + recordFailure, + recordRetry, + recordSuccess, + recordThrottle, +} from "../../../../src/sdkstats/networkStats.js"; + +describe("sdkstats/networkStats", () => { + beforeEach(() => { + _resetAllForTest(); + }); + + it("exposes 6 metric names matching the Python distro", () => { + expect(NETWORK_METRIC_NAMES).toEqual([ + REQUEST_SUCCESS_NAME, + REQUEST_FAILURE_NAME, + REQUEST_RETRY_NAME, + REQUEST_THROTTLE_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_DURATION_NAME, + ]); + expect(REQUEST_SUCCESS_NAME).toBe("request_success_count"); + expect(REQUEST_DURATION_NAME).toBe("request_duration"); + expect(THROTTLE_STATUS_CODES.has(402)).toBe(true); + }); + + it("accumulates success counts per endpoint and reports keys as single-element tuples", () => { + recordSuccess("a.example.com"); + recordSuccess("a.example.com"); + recordSuccess("b.example.com"); + const snap = drain(REQUEST_SUCCESS_NAME); + expect(snap.size).toBe(2); + + const entries = Array.from(snap.entries()).sort(([a], [b]) => a[0].localeCompare(b[0])); + expect(entries[0][0]).toEqual(["a.example.com"]); + expect(entries[0][1]).toBe(2); + expect(entries[1][0]).toEqual(["b.example.com"]); + expect(entries[1][1]).toBe(1); + }); + + it("keys failure/retry/throttle/exception by [endpoint, second-attr]", () => { + recordFailure("a.example.com", 503); + recordFailure("a.example.com", 503); + recordFailure("a.example.com", 502); + recordRetry("a.example.com", 429); + recordThrottle("a.example.com"); + recordException("a.example.com", "AbortError"); + recordException("a.example.com", "AbortError"); + + const failures = drain(REQUEST_FAILURE_NAME); + expect(failures.get(["a.example.com", 
"503"]) ?? + [...failures.entries()].find(([k]) => k[0] === "a.example.com" && k[1] === "503")?.[1]).toBe( + 2, + ); + // Map equality on tuple keys: identity-based; verify by spreading. + const flat = [...failures.entries()].map(([k, v]) => [k.join("|"), v] as const); + expect(flat).toEqual( + expect.arrayContaining([ + ["a.example.com|503", 2], + ["a.example.com|502", 1], + ]), + ); + + const retries = drain(REQUEST_RETRY_NAME); + expect([...retries.values()]).toEqual([1]); + const [retryKey] = [...retries.keys()]; + expect(retryKey).toEqual(["a.example.com", "429"]); + + const throttles = drain(REQUEST_THROTTLE_NAME); + expect([...throttles.keys()][0]).toEqual(["a.example.com", "402"]); + + const exceptions = drain(REQUEST_EXCEPTION_NAME); + expect([...exceptions.entries()]).toEqual([[["a.example.com", "AbortError"], 2]]); + }); + + it("accumulates duration as a sum of seconds", () => { + recordDuration("a.example.com", 0.25); + recordDuration("a.example.com", 1.0); + recordDuration("b.example.com", 2.5); + const snap = drain(REQUEST_DURATION_NAME); + const flat = Object.fromEntries([...snap.entries()].map(([k, v]) => [k[0], v])); + expect(flat["a.example.com"]).toBeCloseTo(1.25); + expect(flat["b.example.com"]).toBeCloseTo(2.5); + }); + + it("drain() empties the bucket atomically — second drain returns an empty map", () => { + recordSuccess("a.example.com"); + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(1); + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); + }); + + it("_resetAllForTest() clears every bucket", () => { + recordSuccess("a.example.com"); + recordFailure("a.example.com", 500); + recordDuration("a.example.com", 1.0); + _resetAllForTest(); + for (const name of NETWORK_METRIC_NAMES) { + expect(drain(name).size).toBe(0); + } + }); +}); diff --git a/test/internal/unit/sdkstats/otlpWrapper.test.ts b/test/internal/unit/sdkstats/otlpWrapper.test.ts new file mode 100644 index 0000000..1a93e06 --- /dev/null +++ 
b/test/internal/unit/sdkstats/otlpWrapper.test.ts @@ -0,0 +1,218 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { ExportResult } from "@opentelemetry/core"; +import { ExportResultCode } from "@opentelemetry/core"; +import type { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base"; +import type { PushMetricExporter, ResourceMetrics } from "@opentelemetry/sdk-metrics"; +import type { LogRecordExporter, ReadableLogRecord } from "@opentelemetry/sdk-logs"; + +import { + NetworkStatsLogExporter, + NetworkStatsMetricExporter, + NetworkStatsSpanExporter, +} from "../../../../src/sdkstats/otlpWrapper.js"; +import { + REQUEST_DURATION_NAME, + REQUEST_EXCEPTION_NAME, + REQUEST_FAILURE_NAME, + REQUEST_SUCCESS_NAME, + _resetAllForTest, + drain, +} from "../../../../src/sdkstats/networkStats.js"; + +const HOST = "collector.example.com"; + +function setEndpointEnv(): void { + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = `https://${HOST}:4318`; +} + +function clearEndpointEnv(): void { + delete process.env.OTEL_EXPORTER_OTLP_ENDPOINT; + delete process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT; + delete process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT; + delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT; +} + +function makeFakeSpanExporter( + result: ExportResult | "throw" | Error, +): SpanExporter & { exported: number } { + return { + exported: 0, + export(_spans: ReadableSpan[], cb: (r: ExportResult) => void): void { + this.exported++; + if (result === "throw") { + throw new TypeError("boom"); + } + if (result instanceof Error) { + cb({ code: ExportResultCode.FAILED, error: result }); + return; + } + cb(result); + }, + shutdown(): Promise { + return Promise.resolve(); + }, + forceFlush(): Promise { + return Promise.resolve(); + }, + }; +} + +describe("sdkstats/otlpWrapper", () => { + beforeEach(() => { + _resetAllForTest(); + setEndpointEnv(); + }); 
+ + afterEach(() => { + _resetAllForTest(); + clearEndpointEnv(); + }); + + describe("NetworkStatsSpanExporter", () => { + it("records success + duration on SUCCESS", async () => { + const inner = makeFakeSpanExporter({ code: ExportResultCode.SUCCESS }); + const wrapper = new NetworkStatsSpanExporter(inner); + + await new Promise((resolve) => + wrapper.export([], (result) => { + expect(result.code).toBe(ExportResultCode.SUCCESS); + resolve(); + }), + ); + expect(inner.exported).toBe(1); + + const success = drain(REQUEST_SUCCESS_NAME); + expect([...success.entries()]).toEqual([[[HOST], 1]]); + const dur = drain(REQUEST_DURATION_NAME); + expect([...dur.keys()][0]).toEqual([HOST]); + }); + + it("records failure(0) + duration on FAILED result (no HTTP status code surfaced)", async () => { + const inner = makeFakeSpanExporter({ code: ExportResultCode.FAILED }); + const wrapper = new NetworkStatsSpanExporter(inner); + await new Promise((resolve) => wrapper.export([], () => resolve())); + + const failure = drain(REQUEST_FAILURE_NAME); + expect([...failure.entries()]).toEqual([[[HOST, "0"], 1]]); + }); + + it("records exception + duration and re-throws on a synchronous throw", async () => { + const inner = makeFakeSpanExporter("throw"); + const wrapper = new NetworkStatsSpanExporter(inner); + + expect(() => wrapper.export([], () => {})).toThrow(TypeError); + const exc = drain(REQUEST_EXCEPTION_NAME); + expect([...exc.entries()]).toEqual([[[HOST, "TypeError"], 1]]); + const dur = drain(REQUEST_DURATION_NAME); + expect(dur.size).toBe(1); + }); + + it("forwards forceFlush and shutdown", async () => { + const inner = makeFakeSpanExporter({ code: ExportResultCode.SUCCESS }); + const flushSpy = vi.spyOn(inner, "forceFlush"); + const shutdownSpy = vi.spyOn(inner, "shutdown"); + const wrapper = new NetworkStatsSpanExporter(inner); + await wrapper.forceFlush(); + await wrapper.shutdown(); + expect(flushSpy).toHaveBeenCalledOnce(); + expect(shutdownSpy).toHaveBeenCalledOnce(); + }); 
+ }); + + describe("NetworkStatsMetricExporter", () => { + function makeMetricExporter(result: ExportResult): PushMetricExporter { + return { + export(_m: ResourceMetrics, cb: (r: ExportResult) => void): void { + cb(result); + }, + forceFlush(): Promise { + return Promise.resolve(); + }, + shutdown(): Promise { + return Promise.resolve(); + }, + selectAggregationTemporality(): 0 { + return 0; + }, + }; + } + + it("records success + duration", async () => { + const wrapper = new NetworkStatsMetricExporter( + makeMetricExporter({ code: ExportResultCode.SUCCESS }), + ); + await new Promise((resolve) => + wrapper.export({} as ResourceMetrics, () => resolve()), + ); + expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[HOST], 1]]); + }); + + it("forwards selectAggregationTemporality only when inner provides it", () => { + const innerWithSelector = makeMetricExporter({ code: ExportResultCode.SUCCESS }); + const wrapperA = new NetworkStatsMetricExporter(innerWithSelector); + expect(typeof wrapperA.selectAggregationTemporality).toBe("function"); + + const innerWithoutSelector: PushMetricExporter = { + export(_m, cb) { + cb({ code: ExportResultCode.SUCCESS }); + }, + forceFlush() { + return Promise.resolve(); + }, + shutdown() { + return Promise.resolve(); + }, + }; + const wrapperB = new NetworkStatsMetricExporter(innerWithoutSelector); + expect(wrapperB.selectAggregationTemporality).toBeUndefined(); + expect(wrapperB.selectAggregation).toBeUndefined(); + }); + }); + + describe("NetworkStatsLogExporter", () => { + function makeLogExporter(result: ExportResult): LogRecordExporter { + return { + export(_l: ReadableLogRecord[], cb: (r: ExportResult) => void): void { + cb(result); + }, + shutdown(): Promise { + return Promise.resolve(); + }, + }; + } + + it("records success + duration on SUCCESS", async () => { + const wrapper = new NetworkStatsLogExporter( + makeLogExporter({ code: ExportResultCode.SUCCESS }), + ); + await new Promise((resolve) => wrapper.export([], 
() => resolve())); + expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[HOST], 1]]); + }); + + it("records failure(0) on FAILED result", async () => { + const wrapper = new NetworkStatsLogExporter( + makeLogExporter({ code: ExportResultCode.FAILED }), + ); + await new Promise((resolve) => wrapper.export([], () => resolve())); + expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[HOST, "0"], 1]]); + }); + }); + + it("falls back to 'unknown' when no OTLP endpoint env vars are set", () => { + clearEndpointEnv(); + const wrapper = new NetworkStatsSpanExporter({ + export: (_s, cb) => cb({ code: ExportResultCode.SUCCESS }), + shutdown: () => Promise.resolve(), + } as SpanExporter); + return new Promise((resolve) => + wrapper.export([], () => { + const success = drain(REQUEST_SUCCESS_NAME); + expect([...success.keys()][0]).toEqual(["unknown"]); + resolve(); + }), + ); + }); +}); From 8cdd2b5eea32d0638f253e0220f04ac2aa7c5321 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Sat, 16 May 2026 22:18:26 -0700 Subject: [PATCH 02/14] fix(sdkstats): align with App Insights statsbeat spec; verified end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend was accepting our network-statsbeat envelopes with HTTP 200 but silently dropping them from the dashboards because the envelopes were missing spec-mandated dimensions and using `endpoint` as a hostname instead of as a category. Metric names (PascalCase per spec): Request_Success_Count, Request_Failure_Count, Retry_Count, Throttle_Count, Exception_Count, Request_Duration. Matches StatsbeatCounter enum in @azure/monitor-opentelemetry-exporter. Dimensions: added required rp (default 'unknown'), attach (default 'Manual'), cikey (parsed from customer connection string). Split endpoint (category: 'otlp'/'a365') from host (short stamp region, port stripped). New exported shortHost() helper mirroring AzMon's getShortHost.
Manager: SdkStatsManager.initialize({ networkOnly?, cikey? }). Default short-export interval renamed to APPLICATIONINSIGHTS_STATS_SHORT_EXPORT_INTERVAL with 15-minute default (matches Python's _get_stats_short_export_interval). Added APPLICATIONINSIGHTS_STATS_CONNECTION_STRING override for testing. Distro: parses customer connection string via ConnectionStringParser and threads instrumentationkey to SdkStatsManager. All 814 unit tests pass; verified end-to-end against Microsoft's statsbeat resource (network gauges now visible alongside long-interval Feature/Attach). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 + src/a365/exporter/Agent365Exporter.ts | 34 ++--- src/distro/distro.ts | 25 +++- src/sdkstats/index.ts | 1 + src/sdkstats/manager.ts | 56 ++++++-- src/sdkstats/metrics.ts | 32 ++++- src/sdkstats/networkStats.ts | 120 ++++++++++++++---- src/sdkstats/otlpWrapper.ts | 60 ++++----- .../unit/a365/agent365NetworkStats.test.ts | 31 +++-- test/internal/unit/sdkstats/manager.test.ts | 10 +- test/internal/unit/sdkstats/metrics.test.ts | 40 +++--- .../unit/sdkstats/networkStats.test.ts | 67 +++++----- .../unit/sdkstats/otlpWrapper.test.ts | 24 ++-- 13 files changed, 318 insertions(+), 185 deletions(-) diff --git a/.gitignore b/.gitignore index d0b0f76..8e15469 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,6 @@ npm-debug.log* # Temp tmp/ + +# Internal planning doc — keep out of the repo +NETWORK_SDKSTATS_PLAN.md diff --git a/src/a365/exporter/Agent365Exporter.ts b/src/a365/exporter/Agent365Exporter.ts index c958523..1908f7a 100644 --- a/src/a365/exporter/Agent365Exporter.ts +++ b/src/a365/exporter/Agent365Exporter.ts @@ -30,6 +30,7 @@ import { recordRetry, recordSuccess, recordThrottle, + shortHost, } from "../../sdkstats/index.js"; const DEFAULT_MAX_RETRIES = 3; @@ -261,17 +262,15 @@ export class Agent365Exporter implements SpanExporter { ): Promise<{ ok: boolean; correlationId: string }> { let lastCorrelationId = 
"unknown"; - // Resolve the endpoint host (and the SDKStats kill-switch) once per - // call so each retry attempt records under the same key without - // re-parsing the URL or re-checking env on every iteration. + // Resolve the short host (and the SDKStats kill-switch) once per call + // so each retry attempt records under the same key without re-parsing + // the URL or re-checking env on every iteration. `endpoint` is the + // category label per spec — A365 transmits report endpoint="a365". const recordA365Stats = isSdkStatsEnabled(); - let endpointHost = url; + const endpointCategory = "a365"; + let host = url; if (recordA365Stats) { - try { - endpointHost = new URL(url).hostname || url; - } catch { - endpointHost = url; - } + host = shortHost(url); } for (let attempt = 0; attempt <= DEFAULT_MAX_RETRIES; attempt++) { @@ -285,7 +284,7 @@ export class Agent365Exporter implements SpanExporter { }); if (recordA365Stats) { - recordDuration(endpointHost, (Date.now() - startTime) / 1000); + recordDuration(endpointCategory, host, (Date.now() - startTime) / 1000); } const correlationId = @@ -296,7 +295,7 @@ export class Agent365Exporter implements SpanExporter { if (response.status >= 200 && response.status < 300) { if (recordA365Stats) { - recordSuccess(endpointHost); + recordSuccess(endpointCategory, host); } return { ok: true, correlationId }; } @@ -309,7 +308,7 @@ export class Agent365Exporter implements SpanExporter { if (recordA365Stats) { // 402 (throttle) is not in the retryable set, so it never // lands here — only true retries. - recordRetry(endpointHost, response.status); + recordRetry(endpointCategory, host, response.status); } if (attempt < DEFAULT_MAX_RETRIES) { const sleepMs = 200 * (attempt + 1) + Math.floor(Math.random() * 100); @@ -322,13 +321,13 @@ export class Agent365Exporter implements SpanExporter { // Retries exhausted: also record a final failure so dashboards // see this as a terminal failure (not just a retry blip). 
if (recordA365Stats) { - recordFailure(endpointHost, response.status); + recordFailure(endpointCategory, host, response.status); } } else if (recordA365Stats) { if (THROTTLE_STATUS_CODES.has(response.status)) { - recordThrottle(endpointHost, response.status); + recordThrottle(endpointCategory, host, response.status); } else { - recordFailure(endpointHost, response.status); + recordFailure(endpointCategory, host, response.status); } } @@ -338,9 +337,10 @@ export class Agent365Exporter implements SpanExporter { return { ok: false, correlationId }; } catch (error) { if (recordA365Stats) { - recordDuration(endpointHost, (Date.now() - startTime) / 1000); + recordDuration(endpointCategory, host, (Date.now() - startTime) / 1000); recordException( - endpointHost, + endpointCategory, + host, error instanceof Error ? error.name || error.constructor.name || "Error" : typeof error, diff --git a/src/distro/distro.ts b/src/distro/distro.ts index ac3bde4..84f68af 100644 --- a/src/distro/distro.ts +++ b/src/distro/distro.ts @@ -20,6 +20,7 @@ import { InternalConfig } from "../shared/config.js"; import { MetricHandler } from "../azureMonitor/metrics/index.js"; import { TraceHandler } from "../azureMonitor/traces/handler.js"; import { LogHandler } from "../azureMonitor/logs/index.js"; +import { ConnectionStringParser } from "../azureMonitor/utils/connectionStringParser.js"; import { AZURE_MONITOR_OPENTELEMETRY_VERSION } from "../types.js"; import { patchOpenTelemetryInstrumentationEnable } from "../utils/opentelemetryInstrumentationPatcher.js"; import { parseResourceDetectorsFromEnvVar } from "../utils/common.js"; @@ -378,19 +379,35 @@ export function useMicrosoftOpenTelemetry(options?: MicrosoftOpenTelemetryOption // ── SDKStats: standalone pipeline ───────────────────────────────── // The standalone pipeline ALWAYS runs so per-export network statsbeat - // (`request_*` gauges) for A365 / OTLP transmits is captured. + // (`Request_*` etc. 
gauges) for A365 / OTLP transmits is captured. // // - When Azure Monitor is enabled (`networkOnly: true`): only the // network gauges are registered. The Feature / Feature.instrumentations // long-interval statsbeat is owned by the AzMon exporter, with our // distro bits bridged in via `setStatsbeatFeatures` → // `AZURE_MONITOR_STATSBEAT_FEATURES`. Network statsbeat is safe to - // coexist because the `endpoint` attribute partitions the time series - // (AzMon ingestion hosts vs A365 / OTLP hosts). + // coexist because the (endpoint, host) attributes partition the + // time series (AzMon ingestion hosts vs A365 / OTLP hosts). // - When Azure Monitor is disabled: the standalone pipeline owns the // full set (feature + instrumentation + network) and ships them to // the well-known statsbeat endpoint. - void SdkStatsManager.getInstance().initialize({ networkOnly: azureMonitorEnabled }); + // + // `cikey` is reported as a customDimension on every observation per + // the SDKStats spec. Parse it from the customer connection string if + // we have one; empty string otherwise. + const sdkStatsCikey = (() => { + const cs = + config.azureMonitorExporterOptions?.connectionString ?? + process.env["APPLICATIONINSIGHTS_CONNECTION_STRING"] ?? + ""; + if (!cs) return ""; + const parsed = ConnectionStringParser.parse(cs); + return parsed.instrumentationkey ?? ""; + })(); + void SdkStatsManager.getInstance().initialize({ + networkOnly: azureMonitorEnabled, + cikey: sdkStatsCikey, + }); // Initialize GenAI instrumentations after providers are registered so any // tracer they capture is backed by the active SDK provider. 
diff --git a/src/sdkstats/index.ts b/src/sdkstats/index.ts index a1befbe..e8d8d84 100644 --- a/src/sdkstats/index.ts +++ b/src/sdkstats/index.ts @@ -50,6 +50,7 @@ export { recordException, recordDuration, drain, + shortHost, _resetAllForTest as _resetNetworkStatsForTest, } from "./networkStats.js"; export type { NetworkMetricName, NetworkKey } from "./networkStats.js"; diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index e44a5da..47cf6a9 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -30,22 +30,42 @@ import { isSdkStatsEnabled, setSdkStatsShutdown } from "./state.js"; import { SdkStatsMetrics } from "./metrics.js"; /** - * Default long export interval (24 hours) for Feature/Instrumentation - * SDKStats per the Application Insights SDKStats specification. + * Default short export interval (15 minutes) for the standalone SDKStats + * pipeline. This matches the Application Insights statsbeat + * short-interval cadence used by the network statsbeat counters and the + * Python distro (`_get_stats_short_export_interval()` in + * `azure.monitor.opentelemetry.exporter.statsbeat._utils`). + * + * The pipeline emits both Feature/Feature.instrumentations gauges + * (when not in `networkOnly` mode) and the six `request_*` network + * gauges; the network counters dominate cadence requirements, so the + * single shared interval defaults to short rather than long. * * @internal */ -const DEFAULT_LONG_EXPORT_INTERVAL_MS = 24 * 60 * 60 * 1000; +const DEFAULT_SHORT_EXPORT_INTERVAL_MS = 15 * 60 * 1000; /** - * Override env var: long export interval in seconds, per the spec. + * Override env var: standalone SDKStats export interval in seconds. + * Matches the Python distro env var name. 
* * @internal */ -const SDKSTATS_LONG_EXPORT_INTERVAL_ENV = "APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL"; +const SDKSTATS_SHORT_EXPORT_INTERVAL_ENV = "APPLICATIONINSIGHTS_STATS_SHORT_EXPORT_INTERVAL"; /** - * Initial-export delay (15 seconds) before the first long-interval flush. + * Override env var: redirect SDKStats envelopes to a custom App + * Insights connection string. When unset, SDKStats flow to the + * Microsoft-owned statsbeat resource (`NON_EU_CONNECTION_STRING` in + * the AzMon exporter package). Primarily useful for testing. + * Matches the Python distro env var name. + * + * @internal + */ +const SDKSTATS_CONNECTION_STRING_ENV = "APPLICATIONINSIGHTS_STATS_CONNECTION_STRING"; + +/** + * Initial-export delay (15 seconds) before the first flush. * * The spec recommends this delay specifically for the Node.js SDK to * avoid short-running CLI-style applications generating excess SDKStats @@ -89,12 +109,17 @@ export class SdkStatsManager { * path because the AzMon exporter's own long-interval statsbeat * already emits those gauges (with our distro bits bridged in via * `AZURE_MONITOR_STATSBEAT_FEATURES`). + * @param options.cikey Customer iKey to report as the `cikey` + * customDimension on every observation. Required by the SDKStats + * spec; pass an empty string only if no customer iKey is available. * * Returns `true` if the standalone pipeline was initialized (or was * already initialized), `false` if SDKStats are disabled via env var * or initialization failed. 
*/ - async initialize(options: { networkOnly?: boolean } = {}): Promise { + async initialize( + options: { networkOnly?: boolean; cikey?: string } = {}, + ): Promise { if (!isSdkStatsEnabled()) { return false; } @@ -127,8 +152,16 @@ export class SdkStatsManager { const AzureMonitorStatsbeatExporter = statsbeatExporterModule.AzureMonitorStatsbeatExporter; const NON_EU_CONNECTION_STRING = statsbeatTypesModule.NON_EU_CONNECTION_STRING; + // Allow overriding the SDKStats ingestion target via env var, + // matching the Python distro's APPLICATIONINSIGHTS_STATS_CONNECTION_STRING + // hook. Primarily useful for testing — production should leave + // this unset so SDKStats flows to the Microsoft-owned statsbeat + // resource (NON_EU_CONNECTION_STRING). + const connectionString = + process.env[SDKSTATS_CONNECTION_STRING_ENV] ?? NON_EU_CONNECTION_STRING; + const exporter = new AzureMonitorStatsbeatExporter({ - connectionString: NON_EU_CONNECTION_STRING, + connectionString, disableOfflineStorage: true, }); @@ -143,6 +176,7 @@ export class SdkStatsManager { }); this._metrics = new SdkStatsMetrics(this._meterProvider, { networkOnly: options.networkOnly, + cikey: options.cikey, }); this._initialized = true; setSdkStatsShutdown(false); @@ -210,11 +244,11 @@ export class SdkStatsManager { } function resolveExportInterval(): number { - const raw = process.env[SDKSTATS_LONG_EXPORT_INTERVAL_ENV]; - if (!raw) return DEFAULT_LONG_EXPORT_INTERVAL_MS; + const raw = process.env[SDKSTATS_SHORT_EXPORT_INTERVAL_ENV]; + if (!raw) return DEFAULT_SHORT_EXPORT_INTERVAL_MS; const seconds = Number(raw); if (!Number.isFinite(seconds) || seconds <= 0) { - return DEFAULT_LONG_EXPORT_INTERVAL_MS; + return DEFAULT_SHORT_EXPORT_INTERVAL_MS; } return Math.floor(seconds * 1000); } diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 537795a..2f29be4 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -107,6 +107,12 @@ if (NETWORK_GAUGE_SPECS.length !== 
NETWORK_METRIC_NAMES.length) { export interface SdkStatsMetricsOptions { /** Override the distro version reported on every observation. */ distroVersion?: string; + /** + * Customer instrumentation key emitted as the `cikey` customDimension + * on every SDKStats observation, per the Application Insights SDKStats + * spec. Pass an empty string when no customer iKey is available. + */ + cikey?: string; /** * When `true`, skip the Feature / Feature.instrumentations gauges. Used * on the Azure-Monitor-enabled path because the AzMon exporter's own @@ -114,10 +120,10 @@ export interface SdkStatsMetricsOptions { * bits bridged in via `AZURE_MONITOR_STATSBEAT_FEATURES`); registering * them here would double-count. * - * The six network statsbeat gauges (`request_*`) are always registered - * regardless of this flag — coexistence with AzMon's own network - * statsbeat is safe because the `endpoint` attribute partitions the - * series by destination host. + * The six network statsbeat gauges (`Request_*` etc.) are always + * registered regardless of this flag — coexistence with AzMon's own + * network statsbeat is safe because the (endpoint, host) attributes + * partition the time series. */ networkOnly?: boolean; } @@ -131,10 +137,18 @@ export class SdkStatsMetrics { private readonly commonAttributes: Record; constructor(meterProvider: MeterProvider, options: SdkStatsMetricsOptions = {}) { - const { distroVersion, networkOnly = false } = options; + const { distroVersion, networkOnly = false, cikey = "" } = options; const meter = meterProvider.getMeter("microsoft.opentelemetry.sdkstats"); + // Per spec/sdkstats.md the required customDimensions on every + // SDKStats observation are: rp, attach, cikey, runtimeVersion, os, + // language, version (plus endpoint/host on network gauges and + // statusCode/exceptionType where applicable). Missing dimensions + // cause envelopes to be silently dropped on the backend. 
this.commonAttributes = { + rp: "unknown", + attach: "Manual", + cikey, runtimeVersion: process.version, os: os.type(), language: STATSBEAT_LANGUAGE, @@ -200,12 +214,16 @@ export class SdkStatsMetrics { private makeNetworkCallback(spec: NetworkGaugeSpec): (result: ObservableResult) => void { return (result: ObservableResult): void => { for (const [key, value] of drain(spec.metric)) { + // Key layout (from networkStats.ts): + // [endpoint, host] → success / duration + // [endpoint, host, statusCode|exceptionType] → others const attrs: Record = { ...this.commonAttributes, endpoint: key[0], + host: key[1], }; - if (spec.secondAttr && key.length === 2) { - attrs[spec.secondAttr] = key[1]; + if (spec.secondAttr && key.length === 3) { + attrs[spec.secondAttr] = key[2]; } result.observe(value, attrs); } diff --git a/src/sdkstats/networkStats.ts b/src/sdkstats/networkStats.ts index 28101ef..474d96f 100644 --- a/src/sdkstats/networkStats.ts +++ b/src/sdkstats/networkStats.ts @@ -21,12 +21,19 @@ */ export const THROTTLE_STATUS_CODES: ReadonlySet = new Set([402]); -export const REQUEST_SUCCESS_NAME = "request_success_count"; -export const REQUEST_FAILURE_NAME = "request_failure_count"; -export const REQUEST_RETRY_NAME = "request_retry_count"; -export const REQUEST_THROTTLE_NAME = "request_throttle_count"; -export const REQUEST_EXCEPTION_NAME = "request_exception_count"; -export const REQUEST_DURATION_NAME = "request_duration"; +// Metric names must match the AzMon statsbeat backend's recognized +// schema (see `StatsbeatCounter` enum in +// `@azure/monitor-opentelemetry-exporter/dist/esm/export/statsbeat/types.js`). +// Sending envelopes under any other name returns HTTP 200 but the +// backend doesn't index them, so they're invisible in the statsbeat +// dashboards. The constants below intentionally match the wire-format +// names — do NOT rename them to lowercase. 
+export const REQUEST_SUCCESS_NAME = "Request_Success_Count"; +export const REQUEST_FAILURE_NAME = "Request_Failure_Count"; +export const REQUEST_RETRY_NAME = "Retry_Count"; +export const REQUEST_THROTTLE_NAME = "Throttle_Count"; +export const REQUEST_EXCEPTION_NAME = "Exception_Count"; +export const REQUEST_DURATION_NAME = "Request_Duration"; /** * Names of all six network statsbeat metrics, in registration order. @@ -47,13 +54,19 @@ export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number]; /** * Composite key for an aggregated network statsbeat counter. * - * - Single-element tuples key on `endpoint` only (success / duration). - * - Two-element tuples key on `[endpoint, statusCode | exceptionType]` + * Per the Application Insights SDKStats spec the per-key dimensions are + * `endpoint` (category, e.g. "otlp", "a365"), `host` (stamp-specific + * region or hostname), and optionally `statusCode` / `exceptionType`. + * + * - 2-element tuples key on `[endpoint, host]` (success / duration). + * - 3-element tuples key on `[endpoint, host, statusCode | exceptionType]` * (failure / retry / throttle / exception). * * @internal */ -export type NetworkKey = readonly [string] | readonly [string, string]; +export type NetworkKey = + | readonly [string, string] + | readonly [string, string, string]; // Single-threaded JS execution → no lock needed (Python uses one because of // the GIL + threads; Node.js doesn't share JS objects across worker threads). @@ -72,13 +85,13 @@ const REQUESTS_MAP: Record> = { const KEY_SEPARATOR = "\u0000"; function encodeKey(key: NetworkKey): string { - return key.length === 1 ? 
key[0] : `${key[0]}${KEY_SEPARATOR}${key[1]}`; + return key.join(KEY_SEPARATOR); } function decodeKey(encoded: string): NetworkKey { - const sep = encoded.indexOf(KEY_SEPARATOR); - if (sep < 0) return [encoded] as const; - return [encoded.slice(0, sep), encoded.slice(sep + 1)] as const; + const parts = encoded.split(KEY_SEPARATOR); + if (parts.length === 2) return [parts[0], parts[1]] as const; + return [parts[0], parts[1], parts[2]] as const; } function bump(metric: NetworkMetricName, key: NetworkKey, value = 1): void { @@ -87,28 +100,85 @@ function bump(metric: NetworkMetricName, key: NetworkKey, value = 1): void { bucket.set(encoded, (bucket.get(encoded) ?? 0) + value); } -export function recordSuccess(endpoint: string): void { - bump(REQUEST_SUCCESS_NAME, [endpoint]); +export function recordSuccess(endpoint: string, host: string): void { + bump(REQUEST_SUCCESS_NAME, [endpoint, host]); +} + +export function recordFailure( + endpoint: string, + host: string, + statusCode: number | string, +): void { + bump(REQUEST_FAILURE_NAME, [endpoint, host, String(statusCode)]); } -export function recordFailure(endpoint: string, statusCode: number | string): void { - bump(REQUEST_FAILURE_NAME, [endpoint, String(statusCode)]); +export function recordRetry( + endpoint: string, + host: string, + statusCode: number | string, +): void { + bump(REQUEST_RETRY_NAME, [endpoint, host, String(statusCode)]); } -export function recordRetry(endpoint: string, statusCode: number | string): void { - bump(REQUEST_RETRY_NAME, [endpoint, String(statusCode)]); +export function recordThrottle( + endpoint: string, + host: string, + statusCode: number | string = 402, +): void { + bump(REQUEST_THROTTLE_NAME, [endpoint, host, String(statusCode)]); } -export function recordThrottle(endpoint: string, statusCode: number | string = 402): void { - bump(REQUEST_THROTTLE_NAME, [endpoint, String(statusCode)]); +export function recordException( + endpoint: string, + host: string, + exceptionType: string, +): 
void { + bump(REQUEST_EXCEPTION_NAME, [endpoint, host, exceptionType]); } -export function recordException(endpoint: string, exceptionType: string): void { - bump(REQUEST_EXCEPTION_NAME, [endpoint, exceptionType]); +export function recordDuration( + endpoint: string, + host: string, + durationSeconds: number, +): void { + bump(REQUEST_DURATION_NAME, [endpoint, host], durationSeconds); } -export function recordDuration(endpoint: string, durationSeconds: number): void { - bump(REQUEST_DURATION_NAME, [endpoint], durationSeconds); +/** + * Compute the stamp-specific short host for the SDKStats `host` dimension. + * + * Mirrors `getShortHost` in the AzMon exporter's `NetworkStatsbeatMetrics` + * but additionally strips any trailing port (`:4318`) so localhost-style + * URLs report a clean `localhost` instead of `localhost:4318`. Examples: + * `https://westus2-1.in.applicationinsights.azure.com` → `westus2` + * `http://localhost:4318/v1/traces` → `localhost` + * `https://collector.example.com:8080` → `collector` + * For non-URL inputs, returns the hostname or the raw input on failure. 
+ * + * @internal + */ +export function shortHost(input: string): string { + if (!input) return "unknown"; + let host = input; + try { + const hostRegex = /^https?:\/\/(?:www\.)?([^/.-]+)/; + const res = hostRegex.exec(input); + if (res && res.length > 1) { + host = res[1]; + } else { + try { + host = new URL(input).hostname || input; + } catch { + host = input; + } + } + host = host.replace(".in.applicationinsights.azure.com", ""); + const colon = host.indexOf(":"); + if (colon > 0) host = host.slice(0, colon); + } catch { + /* fall through */ + } + return host; } /** diff --git a/src/sdkstats/otlpWrapper.ts b/src/sdkstats/otlpWrapper.ts index 1980d51..f465e1b 100644 --- a/src/sdkstats/otlpWrapper.ts +++ b/src/sdkstats/otlpWrapper.ts @@ -30,10 +30,14 @@ import { recordException, recordFailure, recordSuccess, + shortHost, } from "./networkStats.js"; +/** Per spec, `endpoint` is a category label, not the destination URL. */ +const OTLP_ENDPOINT_CATEGORY = "otlp"; + /** - * Resolve the destination hostname for a given OTLP signal. + * Resolve the short-host string for a given OTLP signal. * * The OTel HTTP exporters do not expose their endpoint on a stable public * field, so we read the same env-var precedence the exporters themselves @@ -41,7 +45,7 @@ import { * Falls back to `"unknown"` when no endpoint can be resolved (e.g. fully * programmatic config without env vars). */ -function resolveEndpointHost(signal: "traces" | "metrics" | "logs"): string { +function resolveShortHost(signal: "traces" | "metrics" | "logs"): string { const signalSpecific = signal === "traces" ? "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" @@ -51,12 +55,7 @@ function resolveEndpointHost(signal: "traces" | "metrics" | "logs"): string { const raw = process.env[signalSpecific] ?? 
process.env.OTEL_EXPORTER_OTLP_ENDPOINT; if (!raw) return "unknown"; - - try { - return new URL(raw).hostname || raw; - } catch { - return raw; - } + return shortHost(raw); } /** @@ -70,7 +69,7 @@ function resolveEndpointHost(signal: "traces" | "metrics" | "logs"): string { * exceptions keyed by the error class name. */ function wrapExport( - endpoint: string, + host: string, inner: (resultCallback: (result: ExportResult) => void) => void, resultCallback: (result: ExportResult) => void, _items: T, @@ -80,13 +79,13 @@ function wrapExport( const settle = (result: ExportResult): void => { if (settled) return; settled = true; - recordDuration(endpoint, (Date.now() - start) / 1000); + recordDuration(OTLP_ENDPOINT_CATEGORY, host, (Date.now() - start) / 1000); if (result.code === ExportResultCode.SUCCESS) { - recordSuccess(endpoint); + recordSuccess(OTLP_ENDPOINT_CATEGORY, host); } else { // The HTTP exporters don't expose an HTTP status code, so record // failures with statusCode=0 (matches Python distro). - recordFailure(endpoint, 0); + recordFailure(OTLP_ENDPOINT_CATEGORY, host, 0); } resultCallback(result); }; @@ -95,8 +94,8 @@ function wrapExport( inner(settle); } catch (err) { settled = true; - recordDuration(endpoint, (Date.now() - start) / 1000); - recordException(endpoint, errorName(err)); + recordDuration(OTLP_ENDPOINT_CATEGORY, host, (Date.now() - start) / 1000); + recordException(OTLP_ENDPOINT_CATEGORY, host, errorName(err)); throw err; } } @@ -112,19 +111,14 @@ function errorName(err: unknown): string { * Span exporter decorator that records network statsbeat counts. 
*/ export class NetworkStatsSpanExporter implements SpanExporter { - private readonly endpoint: string; + private readonly host: string; constructor(private readonly inner: SpanExporter) { - this.endpoint = resolveEndpointHost("traces"); + this.host = resolveShortHost("traces"); } export(spans: ReadableSpan[], resultCallback: (result: ExportResult) => void): void { - wrapExport( - this.endpoint, - (cb) => this.inner.export(spans, cb), - resultCallback, - spans, - ); + wrapExport(this.host, (cb) => this.inner.export(spans, cb), resultCallback, spans); } shutdown(): Promise { @@ -145,12 +139,12 @@ export class NetworkStatsSpanExporter implements SpanExporter { * for exporters that don't. */ export class NetworkStatsMetricExporter implements PushMetricExporter { - private readonly endpoint: string; + private readonly host: string; selectAggregationTemporality?: (instrumentType: InstrumentType) => AggregationTemporality; selectAggregation?: (instrumentType: InstrumentType) => AggregationOption; constructor(private readonly inner: PushMetricExporter) { - this.endpoint = resolveEndpointHost("metrics"); + this.host = resolveShortHost("metrics"); if (inner.selectAggregationTemporality) { this.selectAggregationTemporality = (t) => inner.selectAggregationTemporality!(t); } @@ -160,12 +154,7 @@ export class NetworkStatsMetricExporter implements PushMetricExporter { } export(metrics: ResourceMetrics, resultCallback: (result: ExportResult) => void): void { - wrapExport( - this.endpoint, - (cb) => this.inner.export(metrics, cb), - resultCallback, - metrics, - ); + wrapExport(this.host, (cb) => this.inner.export(metrics, cb), resultCallback, metrics); } forceFlush(): Promise { @@ -181,19 +170,14 @@ export class NetworkStatsMetricExporter implements PushMetricExporter { * Log exporter decorator that records network statsbeat counts. 
*/ export class NetworkStatsLogExporter implements LogRecordExporter { - private readonly endpoint: string; + private readonly host: string; constructor(private readonly inner: LogRecordExporter) { - this.endpoint = resolveEndpointHost("logs"); + this.host = resolveShortHost("logs"); } export(logs: ReadableLogRecord[], resultCallback: (result: ExportResult) => void): void { - wrapExport( - this.endpoint, - (cb) => this.inner.export(logs, cb), - resultCallback, - logs, - ); + wrapExport(this.host, (cb) => this.inner.export(logs, cb), resultCallback, logs); } shutdown(): Promise { diff --git a/test/internal/unit/a365/agent365NetworkStats.test.ts b/test/internal/unit/a365/agent365NetworkStats.test.ts index 4f31e1b..d1415b6 100644 --- a/test/internal/unit/a365/agent365NetworkStats.test.ts +++ b/test/internal/unit/a365/agent365NetworkStats.test.ts @@ -15,6 +15,7 @@ import { REQUEST_THROTTLE_NAME, _resetAllForTest, drain, + shortHost as _shortHost, } from "../../../../src/sdkstats/networkStats.js"; import { _resetA365LoggerForTest } from "../../../../src/a365/logging.js"; @@ -58,18 +59,16 @@ function exportSpan(exporter: Agent365Exporter): Promise { function fetchHost(): string { // Whatever URL `Agent365Exporter` POSTs to in the default config — we - // pluck it from the captured fetch args so the test never has to know - // the Agent365 endpoint resolution rules. + // pluck it from the captured fetch args and pass through the same + // shortHost() transform the production code uses. const calls = (globalThis.fetch as unknown as { mock?: { calls: unknown[][] } }).mock?.calls ?? 
[]; if (calls.length === 0) return "unknown"; const url = calls[0][0] as string; - try { - return new URL(url).hostname; - } catch { - return url; - } + return _shortHost(url); } +const ENDPOINT = "a365"; + describe("Agent365Exporter network statsbeat", () => { let fetchSpy: ReturnType; @@ -97,10 +96,10 @@ describe("Agent365Exporter network statsbeat", () => { await exportSpan(exporter); const host = fetchHost(); - expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[host], 1]]); + expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[ENDPOINT, host], 1]]); const dur = drain(REQUEST_DURATION_NAME); - expect([...dur.keys()][0]).toEqual([host]); - expect((dur.get([host]) ?? [...dur.values()][0])).toBeGreaterThanOrEqual(0); + expect([...dur.keys()][0]).toEqual([ENDPOINT, host]); + expect((dur.get([ENDPOINT, host]) ?? [...dur.values()][0])).toBeGreaterThanOrEqual(0); }); it("records request_retry_count for every retryable response and a final request_failure_count when retries are exhausted", async () => { @@ -116,9 +115,9 @@ describe("Agent365Exporter network statsbeat", () => { const host = fetchHost(); const retries = drain(REQUEST_RETRY_NAME); - expect([...retries.entries()]).toEqual([[[host, "503"], 4]]); + expect([...retries.entries()]).toEqual([[[ENDPOINT, host, "503"], 4]]); const failures = drain(REQUEST_FAILURE_NAME); - expect([...failures.entries()]).toEqual([[[host, "503"], 1]]); + expect([...failures.entries()]).toEqual([[[ENDPOINT, host, "503"], 1]]); }); it("records request_failure_count for non-retryable, non-throttle status codes", async () => { @@ -127,7 +126,7 @@ describe("Agent365Exporter network statsbeat", () => { await exportSpan(exporter); const host = fetchHost(); - expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[host, "404"], 1]]); + expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[ENDPOINT, host, "404"], 1]]); expect(drain(REQUEST_RETRY_NAME).size).toBe(0); }); @@ -137,7 +136,7 @@ 
describe("Agent365Exporter network statsbeat", () => { await exportSpan(exporter); const host = fetchHost(); - expect([...drain(REQUEST_THROTTLE_NAME).entries()]).toEqual([[[host, "402"], 1]]); + expect([...drain(REQUEST_THROTTLE_NAME).entries()]).toEqual([[[ENDPOINT, host, "402"], 1]]); expect(drain(REQUEST_FAILURE_NAME).size).toBe(0); }); @@ -156,9 +155,9 @@ describe("Agent365Exporter network statsbeat", () => { const host = fetchHost(); const exceptions = drain(REQUEST_EXCEPTION_NAME); - expect([...exceptions.entries()]).toEqual([[[host, "AbortError"], 4]]); + expect([...exceptions.entries()]).toEqual([[[ENDPOINT, host, "AbortError"], 4]]); const durations = drain(REQUEST_DURATION_NAME); - expect([...durations.keys()][0]).toEqual([host]); + expect([...durations.keys()][0]).toEqual([ENDPOINT, host]); }); it("records nothing when MICROSOFT_OTEL_SDKSTATS_DISABLED=true", async () => { diff --git a/test/internal/unit/sdkstats/manager.test.ts b/test/internal/unit/sdkstats/manager.test.ts index e34b85b..e8ee103 100644 --- a/test/internal/unit/sdkstats/manager.test.ts +++ b/test/internal/unit/sdkstats/manager.test.ts @@ -55,7 +55,7 @@ describe("sdkstats/manager", () => { expect(await manager.shutdown()).toBe(false); }); - it("uses the spec-compliant 24h long-export interval by default", async () => { + it("uses the spec-compliant 15-minute short-export interval by default", async () => { const manager = SdkStatsManager.getInstance(); await manager.initialize(); // Reach into the private MeterProvider's reader to confirm interval. 
@@ -63,11 +63,11 @@ describe("sdkstats/manager", () => { const provider = (manager as any)._meterProvider; const reader = provider?._sharedState?.metricCollectors?.[0]?._metricReader; const intervalMs = reader?._exportInterval; - expect(intervalMs).toBe(24 * 60 * 60 * 1000); + expect(intervalMs).toBe(15 * 60 * 1000); }); - it("honours APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL override (seconds)", async () => { - process.env["APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL"] = "60"; + it("honours APPLICATIONINSIGHTS_STATS_SHORT_EXPORT_INTERVAL override (seconds)", async () => { + process.env["APPLICATIONINSIGHTS_STATS_SHORT_EXPORT_INTERVAL"] = "60"; try { const manager = SdkStatsManager.getInstance(); await manager.initialize(); @@ -76,7 +76,7 @@ describe("sdkstats/manager", () => { const reader = provider?._sharedState?.metricCollectors?.[0]?._metricReader; expect(reader?._exportInterval).toBe(60_000); } finally { - delete process.env["APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL"]; + delete process.env["APPLICATIONINSIGHTS_STATS_SHORT_EXPORT_INTERVAL"]; } }); diff --git a/test/internal/unit/sdkstats/metrics.test.ts b/test/internal/unit/sdkstats/metrics.test.ts index 663b865..132fdc2 100644 --- a/test/internal/unit/sdkstats/metrics.test.ts +++ b/test/internal/unit/sdkstats/metrics.test.ts @@ -175,7 +175,7 @@ describe("sdkstats/metrics", () => { setSdkStatsInstrumentation(StatsbeatInstrumentation.MONGODB); // Drop a network counter so a request_success_count observation will fire. 
_resetNetworkStatsForTest(); - recordSuccess("contoso.example.com"); + recordSuccess("a365", "contoso.example.com"); const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics"); const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); @@ -203,16 +203,16 @@ describe("sdkstats/metrics", () => { }); describe("network gauges (default mode)", () => { - it("emits one observation per drained key, attaches endpoint + statusCode/exceptionType, and clears after collection", async () => { + it("emits one observation per drained key, attaches endpoint + host + statusCode/exceptionType, and clears after collection", async () => { _resetNetworkStatsForTest(); - recordSuccess("a365.example.com"); - recordSuccess("a365.example.com"); - recordFailure("a365.example.com", 503); - recordRetry("a365.example.com", 503); - recordRetry("a365.example.com", 503); - recordThrottle("otlp.example.com", 402); - recordException("otlp.example.com", "AbortError"); - recordDuration("a365.example.com", 1.25); + recordSuccess("a365", "a365.example.com"); + recordSuccess("a365", "a365.example.com"); + recordFailure("a365", "a365.example.com", 503); + recordRetry("a365", "a365.example.com", 503); + recordRetry("a365", "a365.example.com", 503); + recordThrottle("otlp", "otlp.example.com", 402); + recordException("otlp", "otlp.example.com", "AbortError"); + recordDuration("a365", "a365.example.com", 1.25); const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics"); const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); @@ -235,13 +235,15 @@ describe("sdkstats/metrics", () => { const success = byName(REQUEST_SUCCESS_NAME); expect(success).toHaveLength(1); expect(success[0].value).toBe(2); - expect(success[0].attributes.endpoint).toBe("a365.example.com"); + expect(success[0].attributes.endpoint).toBe("a365"); + expect(success[0].attributes.host).toBe("a365.example.com"); 
expect(success[0].attributes.statusCode).toBeUndefined(); const failure = byName(REQUEST_FAILURE_NAME); expect(failure).toHaveLength(1); expect(failure[0].value).toBe(1); - expect(failure[0].attributes.endpoint).toBe("a365.example.com"); + expect(failure[0].attributes.endpoint).toBe("a365"); + expect(failure[0].attributes.host).toBe("a365.example.com"); expect(failure[0].attributes.statusCode).toBe("503"); const retry = byName(REQUEST_RETRY_NAME); @@ -251,7 +253,8 @@ describe("sdkstats/metrics", () => { const throttle = byName(REQUEST_THROTTLE_NAME); expect(throttle).toHaveLength(1); - expect(throttle[0].attributes.endpoint).toBe("otlp.example.com"); + expect(throttle[0].attributes.endpoint).toBe("otlp"); + expect(throttle[0].attributes.host).toBe("otlp.example.com"); expect(throttle[0].attributes.statusCode).toBe("402"); const exception = byName(REQUEST_EXCEPTION_NAME); @@ -262,10 +265,13 @@ describe("sdkstats/metrics", () => { expect(duration).toHaveLength(1); expect(duration[0].value).toBeCloseTo(1.25); - // Second flush after a reset & drain: drain semantics are covered - // in networkStats.test.ts; observable gauges may legitimately repeat - // their last value under CUMULATIVE aggregation depending on the - // SDK's caching, so we don't assert "empty" here. + // Common dimensions per spec. 
+ for (const dp of [...success, ...failure, ...retry, ...throttle, ...exception, ...duration]) { + expect(dp.attributes.rp).toBe("unknown"); + expect(dp.attributes.attach).toBe("Manual"); + expect(dp.attributes.cikey).toBe(""); + expect(dp.attributes.language).toBe("node"); + } await meterProvider.shutdown(); _resetNetworkStatsForTest(); diff --git a/test/internal/unit/sdkstats/networkStats.test.ts b/test/internal/unit/sdkstats/networkStats.test.ts index cf10831..25d6a9e 100644 --- a/test/internal/unit/sdkstats/networkStats.test.ts +++ b/test/internal/unit/sdkstats/networkStats.test.ts @@ -36,80 +36,77 @@ describe("sdkstats/networkStats", () => { REQUEST_EXCEPTION_NAME, REQUEST_DURATION_NAME, ]); - expect(REQUEST_SUCCESS_NAME).toBe("request_success_count"); - expect(REQUEST_DURATION_NAME).toBe("request_duration"); + expect(REQUEST_SUCCESS_NAME).toBe("Request_Success_Count"); + expect(REQUEST_DURATION_NAME).toBe("Request_Duration"); expect(THROTTLE_STATUS_CODES.has(402)).toBe(true); }); - it("accumulates success counts per endpoint and reports keys as single-element tuples", () => { - recordSuccess("a.example.com"); - recordSuccess("a.example.com"); - recordSuccess("b.example.com"); + it("accumulates success counts per (endpoint, host) and reports keys as two-element tuples", () => { + recordSuccess("otlp", "a.example.com"); + recordSuccess("otlp", "a.example.com"); + recordSuccess("otlp", "b.example.com"); const snap = drain(REQUEST_SUCCESS_NAME); expect(snap.size).toBe(2); - const entries = Array.from(snap.entries()).sort(([a], [b]) => a[0].localeCompare(b[0])); - expect(entries[0][0]).toEqual(["a.example.com"]); + const entries = Array.from(snap.entries()).sort(([a], [b]) => a[1].localeCompare(b[1])); + expect(entries[0][0]).toEqual(["otlp", "a.example.com"]); expect(entries[0][1]).toBe(2); - expect(entries[1][0]).toEqual(["b.example.com"]); + expect(entries[1][0]).toEqual(["otlp", "b.example.com"]); expect(entries[1][1]).toBe(1); }); - it("keys 
failure/retry/throttle/exception by [endpoint, second-attr]", () => { - recordFailure("a.example.com", 503); - recordFailure("a.example.com", 503); - recordFailure("a.example.com", 502); - recordRetry("a.example.com", 429); - recordThrottle("a.example.com"); - recordException("a.example.com", "AbortError"); - recordException("a.example.com", "AbortError"); + it("keys failure/retry/throttle/exception by [endpoint, host, second-attr]", () => { + recordFailure("otlp", "a.example.com", 503); + recordFailure("otlp", "a.example.com", 503); + recordFailure("otlp", "a.example.com", 502); + recordRetry("otlp", "a.example.com", 429); + recordThrottle("otlp", "a.example.com"); + recordException("otlp", "a.example.com", "AbortError"); + recordException("otlp", "a.example.com", "AbortError"); const failures = drain(REQUEST_FAILURE_NAME); - expect(failures.get(["a.example.com", "503"]) ?? - [...failures.entries()].find(([k]) => k[0] === "a.example.com" && k[1] === "503")?.[1]).toBe( - 2, - ); - // Map equality on tuple keys: identity-based; verify by spreading. 
const flat = [...failures.entries()].map(([k, v]) => [k.join("|"), v] as const); expect(flat).toEqual( expect.arrayContaining([ - ["a.example.com|503", 2], - ["a.example.com|502", 1], + ["otlp|a.example.com|503", 2], + ["otlp|a.example.com|502", 1], ]), ); const retries = drain(REQUEST_RETRY_NAME); expect([...retries.values()]).toEqual([1]); const [retryKey] = [...retries.keys()]; - expect(retryKey).toEqual(["a.example.com", "429"]); + expect(retryKey).toEqual(["otlp", "a.example.com", "429"]); const throttles = drain(REQUEST_THROTTLE_NAME); - expect([...throttles.keys()][0]).toEqual(["a.example.com", "402"]); + expect([...throttles.keys()][0]).toEqual(["otlp", "a.example.com", "402"]); const exceptions = drain(REQUEST_EXCEPTION_NAME); - expect([...exceptions.entries()]).toEqual([[["a.example.com", "AbortError"], 2]]); + expect([...exceptions.entries()]).toEqual([ + [["otlp", "a.example.com", "AbortError"], 2], + ]); }); it("accumulates duration as a sum of seconds", () => { - recordDuration("a.example.com", 0.25); - recordDuration("a.example.com", 1.0); - recordDuration("b.example.com", 2.5); + recordDuration("otlp", "a.example.com", 0.25); + recordDuration("otlp", "a.example.com", 1.0); + recordDuration("otlp", "b.example.com", 2.5); const snap = drain(REQUEST_DURATION_NAME); - const flat = Object.fromEntries([...snap.entries()].map(([k, v]) => [k[0], v])); + const flat = Object.fromEntries([...snap.entries()].map(([k, v]) => [k[1], v])); expect(flat["a.example.com"]).toBeCloseTo(1.25); expect(flat["b.example.com"]).toBeCloseTo(2.5); }); it("drain() empties the bucket atomically — second drain returns an empty map", () => { - recordSuccess("a.example.com"); + recordSuccess("otlp", "a.example.com"); expect(drain(REQUEST_SUCCESS_NAME).size).toBe(1); expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); }); it("_resetAllForTest() clears every bucket", () => { - recordSuccess("a.example.com"); - recordFailure("a.example.com", 500); - recordDuration("a.example.com", 1.0); 
+ recordSuccess("otlp", "a.example.com"); + recordFailure("otlp", "a.example.com", 500); + recordDuration("otlp", "a.example.com", 1.0); _resetAllForTest(); for (const name of NETWORK_METRIC_NAMES) { expect(drain(name).size).toBe(0); diff --git a/test/internal/unit/sdkstats/otlpWrapper.test.ts b/test/internal/unit/sdkstats/otlpWrapper.test.ts index 1a93e06..017c0fe 100644 --- a/test/internal/unit/sdkstats/otlpWrapper.test.ts +++ b/test/internal/unit/sdkstats/otlpWrapper.test.ts @@ -22,10 +22,14 @@ import { drain, } from "../../../../src/sdkstats/networkStats.js"; -const HOST = "collector.example.com"; +// `shortHost("https://collector.example.com:4318")` strips the first +// path component, so the dimension value the wrappers record is just +// "collector". `endpoint` is the category label ("otlp"). +const HOST = "collector"; +const ENDPOINT = "otlp"; function setEndpointEnv(): void { - process.env.OTEL_EXPORTER_OTLP_ENDPOINT = `https://${HOST}:4318`; + process.env.OTEL_EXPORTER_OTLP_ENDPOINT = `https://collector.example.com:4318`; } function clearEndpointEnv(): void { @@ -85,9 +89,9 @@ describe("sdkstats/otlpWrapper", () => { expect(inner.exported).toBe(1); const success = drain(REQUEST_SUCCESS_NAME); - expect([...success.entries()]).toEqual([[[HOST], 1]]); + expect([...success.entries()]).toEqual([[[ENDPOINT, HOST], 1]]); const dur = drain(REQUEST_DURATION_NAME); - expect([...dur.keys()][0]).toEqual([HOST]); + expect([...dur.keys()][0]).toEqual([ENDPOINT, HOST]); }); it("records failure(0) + duration on FAILED result (no HTTP status code surfaced)", async () => { @@ -96,7 +100,7 @@ describe("sdkstats/otlpWrapper", () => { await new Promise((resolve) => wrapper.export([], () => resolve())); const failure = drain(REQUEST_FAILURE_NAME); - expect([...failure.entries()]).toEqual([[[HOST, "0"], 1]]); + expect([...failure.entries()]).toEqual([[[ENDPOINT, HOST, "0"], 1]]); }); it("records exception + duration and re-throws on a synchronous throw", async () => { @@ -105,7 
+109,7 @@ describe("sdkstats/otlpWrapper", () => { expect(() => wrapper.export([], () => {})).toThrow(TypeError); const exc = drain(REQUEST_EXCEPTION_NAME); - expect([...exc.entries()]).toEqual([[[HOST, "TypeError"], 1]]); + expect([...exc.entries()]).toEqual([[[ENDPOINT, HOST, "TypeError"], 1]]); const dur = drain(REQUEST_DURATION_NAME); expect(dur.size).toBe(1); }); @@ -147,7 +151,7 @@ describe("sdkstats/otlpWrapper", () => { await new Promise((resolve) => wrapper.export({} as ResourceMetrics, () => resolve()), ); - expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[HOST], 1]]); + expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[ENDPOINT, HOST], 1]]); }); it("forwards selectAggregationTemporality only when inner provides it", () => { @@ -189,7 +193,7 @@ describe("sdkstats/otlpWrapper", () => { makeLogExporter({ code: ExportResultCode.SUCCESS }), ); await new Promise((resolve) => wrapper.export([], () => resolve())); - expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[HOST], 1]]); + expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[ENDPOINT, HOST], 1]]); }); it("records failure(0) on FAILED result", async () => { @@ -197,7 +201,7 @@ describe("sdkstats/otlpWrapper", () => { makeLogExporter({ code: ExportResultCode.FAILED }), ); await new Promise((resolve) => wrapper.export([], () => resolve())); - expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[HOST, "0"], 1]]); + expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[ENDPOINT, HOST, "0"], 1]]); }); }); @@ -210,7 +214,7 @@ describe("sdkstats/otlpWrapper", () => { return new Promise((resolve) => wrapper.export([], () => { const success = drain(REQUEST_SUCCESS_NAME); - expect([...success.keys()][0]).toEqual(["unknown"]); + expect([...success.keys()][0]).toEqual([ENDPOINT, "unknown"]); resolve(); }), ); From 8d0d1df8e9176054d21f3630e1a21728cf841b08 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Sat, 16 May 2026 23:37:18 -0700 Subject: [PATCH 
03/14] style: prettier format network sdkstats sources + tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/a365/exporter/Agent365Exporter.ts | 4 +-- src/sdkstats/manager.ts | 4 +-- src/sdkstats/metrics.ts | 1 - src/sdkstats/networkStats.ts | 28 ++++--------------- .../unit/a365/agent365NetworkStats.test.ts | 5 ++-- test/internal/unit/otlp/handler.test.ts | 10 +++---- .../unit/sdkstats/networkStats.test.ts | 4 +-- .../unit/sdkstats/otlpWrapper.test.ts | 4 +-- 8 files changed, 16 insertions(+), 44 deletions(-) diff --git a/src/a365/exporter/Agent365Exporter.ts b/src/a365/exporter/Agent365Exporter.ts index 1908f7a..0bf7f11 100644 --- a/src/a365/exporter/Agent365Exporter.ts +++ b/src/a365/exporter/Agent365Exporter.ts @@ -341,9 +341,7 @@ export class Agent365Exporter implements SpanExporter { recordException( endpointCategory, host, - error instanceof Error - ? error.name || error.constructor.name || "Error" - : typeof error, + error instanceof Error ? error.name || error.constructor.name || "Error" : typeof error, ); } this.logger.error("[Agent365Exporter] Request error:", error); diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index 47cf6a9..95a07b0 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -117,9 +117,7 @@ export class SdkStatsManager { * already initialized), `false` if SDKStats are disabled via env var * or initialization failed. 
*/ - async initialize( - options: { networkOnly?: boolean; cikey?: string } = {}, - ): Promise { + async initialize(options: { networkOnly?: boolean; cikey?: string } = {}): Promise { if (!isSdkStatsEnabled()) { return false; } diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 2f29be4..cd8b9a5 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -230,4 +230,3 @@ export class SdkStatsMetrics { }; } } - diff --git a/src/sdkstats/networkStats.ts b/src/sdkstats/networkStats.ts index 474d96f..f7fec3f 100644 --- a/src/sdkstats/networkStats.ts +++ b/src/sdkstats/networkStats.ts @@ -64,9 +64,7 @@ export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number]; * * @internal */ -export type NetworkKey = - | readonly [string, string] - | readonly [string, string, string]; +export type NetworkKey = readonly [string, string] | readonly [string, string, string]; // Single-threaded JS execution → no lock needed (Python uses one because of // the GIL + threads; Node.js doesn't share JS objects across worker threads). 
@@ -104,19 +102,11 @@ export function recordSuccess(endpoint: string, host: string): void { bump(REQUEST_SUCCESS_NAME, [endpoint, host]); } -export function recordFailure( - endpoint: string, - host: string, - statusCode: number | string, -): void { +export function recordFailure(endpoint: string, host: string, statusCode: number | string): void { bump(REQUEST_FAILURE_NAME, [endpoint, host, String(statusCode)]); } -export function recordRetry( - endpoint: string, - host: string, - statusCode: number | string, -): void { +export function recordRetry(endpoint: string, host: string, statusCode: number | string): void { bump(REQUEST_RETRY_NAME, [endpoint, host, String(statusCode)]); } @@ -128,19 +118,11 @@ export function recordThrottle( bump(REQUEST_THROTTLE_NAME, [endpoint, host, String(statusCode)]); } -export function recordException( - endpoint: string, - host: string, - exceptionType: string, -): void { +export function recordException(endpoint: string, host: string, exceptionType: string): void { bump(REQUEST_EXCEPTION_NAME, [endpoint, host, exceptionType]); } -export function recordDuration( - endpoint: string, - host: string, - durationSeconds: number, -): void { +export function recordDuration(endpoint: string, host: string, durationSeconds: number): void { bump(REQUEST_DURATION_NAME, [endpoint, host], durationSeconds); } diff --git a/test/internal/unit/a365/agent365NetworkStats.test.ts b/test/internal/unit/a365/agent365NetworkStats.test.ts index d1415b6..8bc4ec2 100644 --- a/test/internal/unit/a365/agent365NetworkStats.test.ts +++ b/test/internal/unit/a365/agent365NetworkStats.test.ts @@ -61,7 +61,8 @@ function fetchHost(): string { // Whatever URL `Agent365Exporter` POSTs to in the default config — we // pluck it from the captured fetch args and pass through the same // shortHost() transform the production code uses. - const calls = (globalThis.fetch as unknown as { mock?: { calls: unknown[][] } }).mock?.calls ?? 
[]; + const calls = + (globalThis.fetch as unknown as { mock?: { calls: unknown[][] } }).mock?.calls ?? []; if (calls.length === 0) return "unknown"; const url = calls[0][0] as string; return _shortHost(url); @@ -99,7 +100,7 @@ describe("Agent365Exporter network statsbeat", () => { expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[ENDPOINT, host], 1]]); const dur = drain(REQUEST_DURATION_NAME); expect([...dur.keys()][0]).toEqual([ENDPOINT, host]); - expect((dur.get([ENDPOINT, host]) ?? [...dur.values()][0])).toBeGreaterThanOrEqual(0); + expect(dur.get([ENDPOINT, host]) ?? [...dur.values()][0]).toBeGreaterThanOrEqual(0); }); it("records request_retry_count for every retryable response and a final request_failure_count when retries are exhausted", async () => { diff --git a/test/internal/unit/otlp/handler.test.ts b/test/internal/unit/otlp/handler.test.ts index 30a8946..d827cbe 100644 --- a/test/internal/unit/otlp/handler.test.ts +++ b/test/internal/unit/otlp/handler.test.ts @@ -76,12 +76,10 @@ describe("OTLP Handler", () => { const components = createOtlpComponents(); const spanInner = (components.spanProcessor as unknown as { _exporter: unknown })._exporter; - const metricInner = ( - components.metricReader as unknown as { _exporter: unknown } - )._exporter; - const logInner = ( - components.logRecordProcessor as unknown as { _exporter: unknown } - )._exporter; + const metricInner = (components.metricReader as unknown as { _exporter: unknown }) + ._exporter; + const logInner = (components.logRecordProcessor as unknown as { _exporter: unknown }) + ._exporter; expect(spanInner?.constructor.name).toBe("NetworkStatsSpanExporter"); expect(metricInner?.constructor.name).toBe("NetworkStatsMetricExporter"); diff --git a/test/internal/unit/sdkstats/networkStats.test.ts b/test/internal/unit/sdkstats/networkStats.test.ts index 25d6a9e..bb45a57 100644 --- a/test/internal/unit/sdkstats/networkStats.test.ts +++ b/test/internal/unit/sdkstats/networkStats.test.ts @@ 
-82,9 +82,7 @@ describe("sdkstats/networkStats", () => { expect([...throttles.keys()][0]).toEqual(["otlp", "a.example.com", "402"]); const exceptions = drain(REQUEST_EXCEPTION_NAME); - expect([...exceptions.entries()]).toEqual([ - [["otlp", "a.example.com", "AbortError"], 2], - ]); + expect([...exceptions.entries()]).toEqual([[["otlp", "a.example.com", "AbortError"], 2]]); }); it("accumulates duration as a sum of seconds", () => { diff --git a/test/internal/unit/sdkstats/otlpWrapper.test.ts b/test/internal/unit/sdkstats/otlpWrapper.test.ts index 017c0fe..34c6709 100644 --- a/test/internal/unit/sdkstats/otlpWrapper.test.ts +++ b/test/internal/unit/sdkstats/otlpWrapper.test.ts @@ -148,9 +148,7 @@ describe("sdkstats/otlpWrapper", () => { const wrapper = new NetworkStatsMetricExporter( makeMetricExporter({ code: ExportResultCode.SUCCESS }), ); - await new Promise((resolve) => - wrapper.export({} as ResourceMetrics, () => resolve()), - ); + await new Promise((resolve) => wrapper.export({} as ResourceMetrics, () => resolve())); expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[ENDPOINT, HOST], 1]]); }); From dfd8936b29645e9ddfec0e081e4f11f5e6bef15d Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Sat, 16 May 2026 23:48:29 -0700 Subject: [PATCH 04/14] fix(sdkstats): omit cikey dimension for non-AzMon customers If the customer isn't exporting to an Application Insights resource (OTLP-only / Console-only), there's no meaningful customer iKey to attach to SDKStats observations. Tagging with an empty string is worse than dropping the dimension entirely. Now: SdkStatsMetrics omits the cikey attribute when undefined/empty; distro passes undefined when no connection string is configured. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/distro/distro.ts | 16 ++++++++-------- src/sdkstats/manager.ts | 5 ++++- src/sdkstats/metrics.ts | 15 +++++++++------ test/internal/unit/sdkstats/metrics.test.ts | 2 +- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/distro/distro.ts b/src/distro/distro.ts index 84f68af..d29526f 100644 --- a/src/distro/distro.ts +++ b/src/distro/distro.ts @@ -392,17 +392,17 @@ export function useMicrosoftOpenTelemetry(options?: MicrosoftOpenTelemetryOption // full set (feature + instrumentation + network) and ships them to // the well-known statsbeat endpoint. // - // `cikey` is reported as a customDimension on every observation per - // the SDKStats spec. Parse it from the customer connection string if - // we have one; empty string otherwise. + // `cikey` is reported as a customDimension on every SDKStats + // observation per the spec, but ONLY when the customer is exporting + // to an Application Insights resource. For OTLP-only / Console-only + // customers we leave it undefined so the dimension is omitted + // entirely rather than tagged with an empty / meaningless value. const sdkStatsCikey = (() => { const cs = config.azureMonitorExporterOptions?.connectionString ?? - process.env["APPLICATIONINSIGHTS_CONNECTION_STRING"] ?? - ""; - if (!cs) return ""; - const parsed = ConnectionStringParser.parse(cs); - return parsed.instrumentationkey ?? ""; + process.env["APPLICATIONINSIGHTS_CONNECTION_STRING"]; + if (!cs) return undefined; + return ConnectionStringParser.parse(cs).instrumentationkey || undefined; })(); void SdkStatsManager.getInstance().initialize({ networkOnly: azureMonitorEnabled, diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index 95a07b0..7281917 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -111,7 +111,10 @@ export class SdkStatsManager { * `AZURE_MONITOR_STATSBEAT_FEATURES`). 
* @param options.cikey Customer iKey to report as the `cikey` * customDimension on every observation. Required by the SDKStats - * spec; pass an empty string only if no customer iKey is available. + * spec. Omit (or pass undefined) when the customer is not exporting + * to an Application Insights resource — the dimension is then + * dropped from the observation entirely rather than emitted as an + * empty string. * * Returns `true` if the standalone pipeline was initialized (or was * already initialized), `false` if SDKStats are disabled via env var diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index cd8b9a5..417c340 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -110,7 +110,8 @@ export interface SdkStatsMetricsOptions { /** * Customer instrumentation key emitted as the `cikey` customDimension * on every SDKStats observation, per the Application Insights SDKStats - * spec. Pass an empty string when no customer iKey is available. + * spec. Omitted entirely when undefined or empty (e.g. for OTLP-only + * customers without an Application Insights connection string). */ cikey?: string; /** @@ -137,22 +138,24 @@ export class SdkStatsMetrics { private readonly commonAttributes: Record; constructor(meterProvider: MeterProvider, options: SdkStatsMetricsOptions = {}) { - const { distroVersion, networkOnly = false, cikey = "" } = options; + const { distroVersion, networkOnly = false, cikey } = options; const meter = meterProvider.getMeter("microsoft.opentelemetry.sdkstats"); // Per spec/sdkstats.md the required customDimensions on every - // SDKStats observation are: rp, attach, cikey, runtimeVersion, os, + // SDKStats observation are: rp, attach, runtimeVersion, os, // language, version (plus endpoint/host on network gauges and - // statusCode/exceptionType where applicable). Missing dimensions - // cause envelopes to be silently dropped on the backend. + // statusCode/exceptionType where applicable). 
`cikey` is only + // meaningful when the customer is exporting to an Application + // Insights resource; omit it entirely for OTLP-only / Console-only + // customers rather than emitting an empty string. this.commonAttributes = { rp: "unknown", attach: "Manual", - cikey, runtimeVersion: process.version, os: os.type(), language: STATSBEAT_LANGUAGE, version: distroVersion || MICROSOFT_OPENTELEMETRY_VERSION, + ...(cikey ? { cikey } : {}), }; // Feature / instrumentation bitmask gauges are skipped when running diff --git a/test/internal/unit/sdkstats/metrics.test.ts b/test/internal/unit/sdkstats/metrics.test.ts index 132fdc2..02dc149 100644 --- a/test/internal/unit/sdkstats/metrics.test.ts +++ b/test/internal/unit/sdkstats/metrics.test.ts @@ -269,7 +269,7 @@ describe("sdkstats/metrics", () => { for (const dp of [...success, ...failure, ...retry, ...throttle, ...exception, ...duration]) { expect(dp.attributes.rp).toBe("unknown"); expect(dp.attributes.attach).toBe("Manual"); - expect(dp.attributes.cikey).toBe(""); + expect(dp.attributes.cikey).toBeUndefined(); expect(dp.attributes.language).toBe("node"); } From f47236174f3bf95ae0df0c445d8c8c306dc68618 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Wed, 20 May 2026 10:42:05 -0700 Subject: [PATCH 05/14] refactor(sdkstats): scope network statsbeat to Request_Success_Count only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce the initial PR scope to only the Request_Success_Count network statsbeat metric. Remove failure, retry, throttle, exception, and duration counters — those will be added in follow-up PRs. 
- networkStats.ts: keep only recordSuccess, shortHost, drain - otlpWrapper.ts: simplified wrapExport to only record success - Agent365Exporter.ts: remove all record* calls except recordSuccess - metrics.ts: single network gauge spec (success count only) - Updated all tests to match reduced scope Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/a365/exporter/Agent365Exporter.ts | 41 +--------- src/distro/distro.ts | 2 +- src/sdkstats/index.ts | 11 --- src/sdkstats/manager.ts | 8 +- src/sdkstats/metrics.ts | 49 +----------- src/sdkstats/networkStats.ts | 74 +++---------------- src/sdkstats/otlpWrapper.ts | 43 ++--------- .../unit/a365/agent365NetworkStats.test.ts | 71 ++---------------- test/internal/unit/sdkstats/metrics.test.ts | 46 +----------- .../unit/sdkstats/networkStats.test.ts | 66 +---------------- .../unit/sdkstats/otlpWrapper.test.ts | 42 ++--------- 11 files changed, 43 insertions(+), 410 deletions(-) diff --git a/src/a365/exporter/Agent365Exporter.ts b/src/a365/exporter/Agent365Exporter.ts index 0bf7f11..3f7cc50 100644 --- a/src/a365/exporter/Agent365Exporter.ts +++ b/src/a365/exporter/Agent365Exporter.ts @@ -21,17 +21,7 @@ import { chunkBySize, } from "./utils.js"; import { getA365Logger } from "../logging.js"; -import { - THROTTLE_STATUS_CODES, - isSdkStatsEnabled, - recordDuration, - recordException, - recordFailure, - recordRetry, - recordSuccess, - recordThrottle, - shortHost, -} from "../../sdkstats/index.js"; +import { isSdkStatsEnabled, recordSuccess, shortHost } from "../../sdkstats/index.js"; const DEFAULT_MAX_RETRIES = 3; @@ -274,7 +264,6 @@ export class Agent365Exporter implements SpanExporter { } for (let attempt = 0; attempt <= DEFAULT_MAX_RETRIES; attempt++) { - const startTime = Date.now(); try { const response = await fetch(url, { method: "POST", @@ -283,10 +272,6 @@ export class Agent365Exporter implements SpanExporter { signal: AbortSignal.timeout(this.options.httpRequestTimeoutMilliseconds), }); - if 
(recordA365Stats) { - recordDuration(endpointCategory, host, (Date.now() - startTime) / 1000); - } - const correlationId = response.headers.get("x-ms-correlation-id") ?? response.headers.get("x-correlation-id") ?? @@ -305,11 +290,6 @@ export class Agent365Exporter implements SpanExporter { [408, 429].includes(response.status) || (response.status >= 500 && response.status < 600) ) { - if (recordA365Stats) { - // 402 (throttle) is not in the retryable set, so it never - // lands here — only true retries. - recordRetry(endpointCategory, host, response.status); - } if (attempt < DEFAULT_MAX_RETRIES) { const sleepMs = 200 * (attempt + 1) + Math.floor(Math.random() * 100); this.logger.warn( @@ -318,17 +298,6 @@ export class Agent365Exporter implements SpanExporter { await sleep(sleepMs); continue; } - // Retries exhausted: also record a final failure so dashboards - // see this as a terminal failure (not just a retry blip). - if (recordA365Stats) { - recordFailure(endpointCategory, host, response.status); - } - } else if (recordA365Stats) { - if (THROTTLE_STATUS_CODES.has(response.status)) { - recordThrottle(endpointCategory, host, response.status); - } else { - recordFailure(endpointCategory, host, response.status); - } } this.logger.error( @@ -336,14 +305,6 @@ export class Agent365Exporter implements SpanExporter { ); return { ok: false, correlationId }; } catch (error) { - if (recordA365Stats) { - recordDuration(endpointCategory, host, (Date.now() - startTime) / 1000); - recordException( - endpointCategory, - host, - error instanceof Error ? 
error.name || error.constructor.name || "Error" : typeof error, - ); - } this.logger.error("[Agent365Exporter] Request error:", error); if (attempt < DEFAULT_MAX_RETRIES) { await sleep(200 * (attempt + 1)); diff --git a/src/distro/distro.ts b/src/distro/distro.ts index d29526f..5d23386 100644 --- a/src/distro/distro.ts +++ b/src/distro/distro.ts @@ -379,7 +379,7 @@ export function useMicrosoftOpenTelemetry(options?: MicrosoftOpenTelemetryOption // ── SDKStats: standalone pipeline ───────────────────────────────── // The standalone pipeline ALWAYS runs so per-export network statsbeat - // (`Request_*` etc. gauges) for A365 / OTLP transmits is captured. + // (`Request_Success_Count` gauge) for A365 / OTLP transmits is captured. // // - When Azure Monitor is enabled (`networkOnly: true`): only the // network gauges are registered. The Feature / Feature.instrumentations diff --git a/src/sdkstats/index.ts b/src/sdkstats/index.ts index e8d8d84..f123d66 100644 --- a/src/sdkstats/index.ts +++ b/src/sdkstats/index.ts @@ -35,20 +35,9 @@ export type { SdkStatsMetricsOptions } from "./metrics.js"; export { SdkStatsManager } from "./manager.js"; export { - THROTTLE_STATUS_CODES, REQUEST_SUCCESS_NAME, - REQUEST_FAILURE_NAME, - REQUEST_RETRY_NAME, - REQUEST_THROTTLE_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_DURATION_NAME, NETWORK_METRIC_NAMES, recordSuccess, - recordFailure, - recordRetry, - recordThrottle, - recordException, - recordDuration, drain, shortHost, _resetAllForTest as _resetNetworkStatsForTest, diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index 7281917..ff33f4e 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -37,9 +37,9 @@ import { SdkStatsMetrics } from "./metrics.js"; * `azure.monitor.opentelemetry.exporter.statsbeat._utils`). 
* * The pipeline emits both Feature/Feature.instrumentations gauges - * (when not in `networkOnly` mode) and the six `request_*` network - * gauges; the network counters dominate cadence requirements, so the - * single shared interval defaults to short rather than long. + * (when not in `networkOnly` mode) and the `Request_Success_Count` + * network gauge; the network counter dominates cadence requirements, + * so the single shared interval defaults to short rather than long. * * @internal */ @@ -104,7 +104,7 @@ export class SdkStatsManager { * Set up SDKStats export via the Azure Monitor statsbeat endpoint. * * @param options.networkOnly When `true`, the {@link SdkStatsMetrics} - * instance only registers the six network gauges and skips the + * instance only registers the network gauge(s) and skips the * feature/instrumentation gauges. Used on the Azure-Monitor-enabled * path because the AzMon exporter's own long-interval statsbeat * already emits those gauges (with our distro bits bridged in via diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 417c340..853f1ee 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -20,12 +20,7 @@ import { MICROSOFT_OPENTELEMETRY_VERSION } from "../types.js"; import { getSdkStatsFeatureFlags, getSdkStatsInstrumentationFlags } from "./state.js"; import { NETWORK_METRIC_NAMES, - REQUEST_DURATION_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_FAILURE_NAME, - REQUEST_RETRY_NAME, REQUEST_SUCCESS_NAME, - REQUEST_THROTTLE_NAME, drain, type NetworkMetricName, } from "./networkStats.js"; @@ -44,15 +39,10 @@ const INSTRUMENTATION_METRIC_NAME = "Feature.instrumentations"; const STATSBEAT_LANGUAGE = "node"; /** - * Per-metric configuration for the six network statsbeat gauges. - * - * - `secondAttr` — name of the additional dimension (`statusCode` or - * `exceptionType`) reported alongside `endpoint`. `undefined` means the - * metric is keyed on `endpoint` only. 
+ * Per-metric configuration for the network statsbeat gauges. */ interface NetworkGaugeSpec { metric: NetworkMetricName; - secondAttr?: "statusCode" | "exceptionType"; unit: string; description: string; } @@ -63,35 +53,6 @@ const NETWORK_GAUGE_SPECS: readonly NetworkGaugeSpec[] = [ unit: "count", description: "Number of successful HTTP exports per endpoint", }, - { - metric: REQUEST_FAILURE_NAME, - secondAttr: "statusCode", - unit: "count", - description: "Number of failed HTTP exports per endpoint and status code", - }, - { - metric: REQUEST_RETRY_NAME, - secondAttr: "statusCode", - unit: "count", - description: "Number of retried HTTP exports per endpoint and status code", - }, - { - metric: REQUEST_THROTTLE_NAME, - secondAttr: "statusCode", - unit: "count", - description: "Number of throttled HTTP exports per endpoint and status code", - }, - { - metric: REQUEST_EXCEPTION_NAME, - secondAttr: "exceptionType", - unit: "count", - description: "Number of HTTP exports that raised an exception, per endpoint and exception type", - }, - { - metric: REQUEST_DURATION_NAME, - unit: "s", - description: "Cumulative HTTP export duration per endpoint", - }, ]; // Sanity check at module load — keeps NETWORK_GAUGE_SPECS in sync with @@ -121,7 +82,7 @@ export interface SdkStatsMetricsOptions { * bits bridged in via `AZURE_MONITOR_STATSBEAT_FEATURES`); registering * them here would double-count. * - * The six network statsbeat gauges (`Request_*` etc.) are always + * The network statsbeat gauge (`Request_Success_Count`) is always * registered regardless of this flag — coexistence with AzMon's own * network statsbeat is safe because the (endpoint, host) attributes * partition the time series. 
@@ -217,17 +178,11 @@ export class SdkStatsMetrics { private makeNetworkCallback(spec: NetworkGaugeSpec): (result: ObservableResult) => void { return (result: ObservableResult): void => { for (const [key, value] of drain(spec.metric)) { - // Key layout (from networkStats.ts): - // [endpoint, host] → success / duration - // [endpoint, host, statusCode|exceptionType] → others const attrs: Record = { ...this.commonAttributes, endpoint: key[0], host: key[1], }; - if (spec.secondAttr && key.length === 3) { - attrs[spec.secondAttr] = key[2]; - } result.observe(value, attrs); } }; diff --git a/src/sdkstats/networkStats.ts b/src/sdkstats/networkStats.ts index f7fec3f..94a6528 100644 --- a/src/sdkstats/networkStats.ts +++ b/src/sdkstats/networkStats.ts @@ -4,23 +4,15 @@ /** * Network statsbeat accumulator for SDK self-telemetry. * - * Per-export success / failure / retry / throttle / exception counts and - * cumulative request duration for telemetry exporters. Exporters call the - * `record*` functions after each transmit; the {@link SdkStatsMetrics} - * observable-gauge callbacks drain the accumulated counts on each export - * interval. + * Per-export success counts for telemetry exporters. Exporters call + * {@link recordSuccess} after each successful transmit; the + * {@link SdkStatsMetrics} observable-gauge callbacks drain the + * accumulated counts on each export interval. * * Mirrors `src/microsoft/opentelemetry/_sdkstats/_utils.py` from the Python * distro (microsoft/opentelemetry-distro-python#144). */ -/** - * HTTP status codes treated as throttling for SDKStats purposes. - * - * @internal - */ -export const THROTTLE_STATUS_CODES: ReadonlySet = new Set([402]); - // Metric names must match the AzMon statsbeat backend's recognized // schema (see `StatsbeatCounter` enum in // `@azure/monitor-opentelemetry-exporter/dist/esm/export/statsbeat/types.js`). @@ -29,25 +21,13 @@ export const THROTTLE_STATUS_CODES: ReadonlySet = new Set([402]); // dashboards. 
The constants below intentionally match the wire-format // names — do NOT rename them to lowercase. export const REQUEST_SUCCESS_NAME = "Request_Success_Count"; -export const REQUEST_FAILURE_NAME = "Request_Failure_Count"; -export const REQUEST_RETRY_NAME = "Retry_Count"; -export const REQUEST_THROTTLE_NAME = "Throttle_Count"; -export const REQUEST_EXCEPTION_NAME = "Exception_Count"; -export const REQUEST_DURATION_NAME = "Request_Duration"; /** - * Names of all six network statsbeat metrics, in registration order. + * Names of registered network statsbeat metrics, in registration order. * * @internal */ -export const NETWORK_METRIC_NAMES = [ - REQUEST_SUCCESS_NAME, - REQUEST_FAILURE_NAME, - REQUEST_RETRY_NAME, - REQUEST_THROTTLE_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_DURATION_NAME, -] as const; +export const NETWORK_METRIC_NAMES = [REQUEST_SUCCESS_NAME] as const; export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number]; @@ -55,26 +35,17 @@ export type NetworkMetricName = (typeof NETWORK_METRIC_NAMES)[number]; * Composite key for an aggregated network statsbeat counter. * * Per the Application Insights SDKStats spec the per-key dimensions are - * `endpoint` (category, e.g. "otlp", "a365"), `host` (stamp-specific - * region or hostname), and optionally `statusCode` / `exceptionType`. - * - * - 2-element tuples key on `[endpoint, host]` (success / duration). - * - 3-element tuples key on `[endpoint, host, statusCode | exceptionType]` - * (failure / retry / throttle / exception). + * `endpoint` (category, e.g. "otlp", "a365") and `host` (stamp-specific + * region or hostname). * * @internal */ -export type NetworkKey = readonly [string, string] | readonly [string, string, string]; +export type NetworkKey = readonly [string, string]; // Single-threaded JS execution → no lock needed (Python uses one because of // the GIL + threads; Node.js doesn't share JS objects across worker threads). 
const REQUESTS_MAP: Record> = { [REQUEST_SUCCESS_NAME]: new Map(), - [REQUEST_FAILURE_NAME]: new Map(), - [REQUEST_RETRY_NAME]: new Map(), - [REQUEST_THROTTLE_NAME]: new Map(), - [REQUEST_EXCEPTION_NAME]: new Map(), - [REQUEST_DURATION_NAME]: new Map(), }; // `Map` keys are compared by identity for arrays/objects, so we serialize @@ -88,8 +59,7 @@ function encodeKey(key: NetworkKey): string { function decodeKey(encoded: string): NetworkKey { const parts = encoded.split(KEY_SEPARATOR); - if (parts.length === 2) return [parts[0], parts[1]] as const; - return [parts[0], parts[1], parts[2]] as const; + return [parts[0], parts[1]] as const; } function bump(metric: NetworkMetricName, key: NetworkKey, value = 1): void { @@ -102,30 +72,6 @@ export function recordSuccess(endpoint: string, host: string): void { bump(REQUEST_SUCCESS_NAME, [endpoint, host]); } -export function recordFailure(endpoint: string, host: string, statusCode: number | string): void { - bump(REQUEST_FAILURE_NAME, [endpoint, host, String(statusCode)]); -} - -export function recordRetry(endpoint: string, host: string, statusCode: number | string): void { - bump(REQUEST_RETRY_NAME, [endpoint, host, String(statusCode)]); -} - -export function recordThrottle( - endpoint: string, - host: string, - statusCode: number | string = 402, -): void { - bump(REQUEST_THROTTLE_NAME, [endpoint, host, String(statusCode)]); -} - -export function recordException(endpoint: string, host: string, exceptionType: string): void { - bump(REQUEST_EXCEPTION_NAME, [endpoint, host, exceptionType]); -} - -export function recordDuration(endpoint: string, host: string, durationSeconds: number): void { - bump(REQUEST_DURATION_NAME, [endpoint, host], durationSeconds); -} - /** * Compute the stamp-specific short host for the SDKStats `host` dimension. 
* diff --git a/src/sdkstats/otlpWrapper.ts b/src/sdkstats/otlpWrapper.ts index f465e1b..872c4fe 100644 --- a/src/sdkstats/otlpWrapper.ts +++ b/src/sdkstats/otlpWrapper.ts @@ -7,7 +7,7 @@ * The upstream OTLP HTTP exporters do not surface HTTP status codes — only * the {@link ExportResult} enum and any raised exception. The decorators * here capture that signal so the network statsbeat pipeline can record - * success / failure / exception / duration counts per endpoint. + * success counts per endpoint. * * Mirrors `src/microsoft/opentelemetry/_sdkstats/_otlp_wrapper.py` from the * Python distro (microsoft/opentelemetry-distro-python#144). @@ -25,13 +25,7 @@ import type { import type { ReadableSpan, SpanExporter } from "@opentelemetry/sdk-trace-base"; import type { LogRecordExporter, ReadableLogRecord } from "@opentelemetry/sdk-logs"; -import { - recordDuration, - recordException, - recordFailure, - recordSuccess, - shortHost, -} from "./networkStats.js"; +import { recordSuccess, shortHost } from "./networkStats.js"; /** Per spec, `endpoint` is a category label, not the destination URL. */ const OTLP_ENDPOINT_CATEGORY = "otlp"; @@ -61,12 +55,8 @@ function resolveShortHost(signal: "traces" | "metrics" | "logs"): string { /** * Common bookkeeping for an export attempt. * - * The OTel JS exporter contract is callback-based, not promise-based, and - * the HTTP exporters surface no status code — only an {@link ExportResult}. - * On `ExportResultCode.SUCCESS` we record a success; otherwise we record - * failure with a placeholder `statusCode=0` (matching the Python distro). - * Synchronous throws and async-completed errors are both recorded as - * exceptions keyed by the error class name. + * On `ExportResultCode.SUCCESS` we record a success count. Other outcomes + * (failure, exception, duration) will be added in a future PR. 
*/ function wrapExport( host: string, @@ -74,37 +64,14 @@ function wrapExport( resultCallback: (result: ExportResult) => void, _items: T, ): void { - const start = Date.now(); - let settled = false; const settle = (result: ExportResult): void => { - if (settled) return; - settled = true; - recordDuration(OTLP_ENDPOINT_CATEGORY, host, (Date.now() - start) / 1000); if (result.code === ExportResultCode.SUCCESS) { recordSuccess(OTLP_ENDPOINT_CATEGORY, host); - } else { - // The HTTP exporters don't expose an HTTP status code, so record - // failures with statusCode=0 (matches Python distro). - recordFailure(OTLP_ENDPOINT_CATEGORY, host, 0); } resultCallback(result); }; - try { - inner(settle); - } catch (err) { - settled = true; - recordDuration(OTLP_ENDPOINT_CATEGORY, host, (Date.now() - start) / 1000); - recordException(OTLP_ENDPOINT_CATEGORY, host, errorName(err)); - throw err; - } -} - -function errorName(err: unknown): string { - if (err instanceof Error) { - return err.name || err.constructor.name || "Error"; - } - return typeof err; + inner(settle); } /** diff --git a/test/internal/unit/a365/agent365NetworkStats.test.ts b/test/internal/unit/a365/agent365NetworkStats.test.ts index 8bc4ec2..429e73c 100644 --- a/test/internal/unit/a365/agent365NetworkStats.test.ts +++ b/test/internal/unit/a365/agent365NetworkStats.test.ts @@ -7,15 +7,9 @@ import type { ReadableSpan } from "@opentelemetry/sdk-trace-base"; import { Agent365Exporter } from "../../../../src/a365/exporter/Agent365Exporter.js"; import { - REQUEST_DURATION_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_FAILURE_NAME, - REQUEST_RETRY_NAME, REQUEST_SUCCESS_NAME, - REQUEST_THROTTLE_NAME, _resetAllForTest, drain, - shortHost as _shortHost, } from "../../../../src/sdkstats/networkStats.js"; import { _resetA365LoggerForTest } from "../../../../src/a365/logging.js"; @@ -57,19 +51,6 @@ function exportSpan(exporter: Agent365Exporter): Promise { return new Promise((resolve) => exporter.export([makeSpan()], (r) => 
resolve(r.code))); } -function fetchHost(): string { - // Whatever URL `Agent365Exporter` POSTs to in the default config — we - // pluck it from the captured fetch args and pass through the same - // shortHost() transform the production code uses. - const calls = - (globalThis.fetch as unknown as { mock?: { calls: unknown[][] } }).mock?.calls ?? []; - if (calls.length === 0) return "unknown"; - const url = calls[0][0] as string; - return _shortHost(url); -} - -const ENDPOINT = "a365"; - describe("Agent365Exporter network statsbeat", () => { let fetchSpy: ReturnType; @@ -87,7 +68,7 @@ describe("Agent365Exporter network statsbeat", () => { vi.restoreAllMocks(); }); - it("records request_success_count + request_duration on a 2xx response", async () => { + it("records request_success_count on a 2xx response", async () => { fetchSpy.mockResolvedValue({ status: 200, headers: new Map([["x-ms-correlation-id", "c1"]]), @@ -96,16 +77,12 @@ describe("Agent365Exporter network statsbeat", () => { const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); await exportSpan(exporter); - const host = fetchHost(); - expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[ENDPOINT, host], 1]]); - const dur = drain(REQUEST_DURATION_NAME); - expect([...dur.keys()][0]).toEqual([ENDPOINT, host]); - expect(dur.get([ENDPOINT, host]) ?? [...dur.values()][0]).toBeGreaterThanOrEqual(0); + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(1); }); - it("records request_retry_count for every retryable response and a final request_failure_count when retries are exhausted", async () => { + it("does not record success on non-2xx responses", async () => { fetchSpy.mockResolvedValue({ status: 503, headers: new Map() }); - // Speed up retries — postWithRetries does 1 initial + 3 retries = 4 attempts. 
+ // Speed up retries vi.spyOn(globalThis, "setTimeout").mockImplementation(((cb: () => void) => { cb(); return 0 as unknown as NodeJS.Timeout; @@ -114,38 +91,11 @@ describe("Agent365Exporter network statsbeat", () => { const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); await exportSpan(exporter); - const host = fetchHost(); - const retries = drain(REQUEST_RETRY_NAME); - expect([...retries.entries()]).toEqual([[[ENDPOINT, host, "503"], 4]]); - const failures = drain(REQUEST_FAILURE_NAME); - expect([...failures.entries()]).toEqual([[[ENDPOINT, host, "503"], 1]]); - }); - - it("records request_failure_count for non-retryable, non-throttle status codes", async () => { - fetchSpy.mockResolvedValue({ status: 404, headers: new Map() }); - const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); - await exportSpan(exporter); - - const host = fetchHost(); - expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[ENDPOINT, host, "404"], 1]]); - expect(drain(REQUEST_RETRY_NAME).size).toBe(0); - }); - - it("records request_throttle_count on HTTP 402", async () => { - fetchSpy.mockResolvedValue({ status: 402, headers: new Map() }); - const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); - await exportSpan(exporter); - - const host = fetchHost(); - expect([...drain(REQUEST_THROTTLE_NAME).entries()]).toEqual([[[ENDPOINT, host, "402"], 1]]); - expect(drain(REQUEST_FAILURE_NAME).size).toBe(0); + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); }); - it("records request_exception_count + duration when fetch rejects, on every retry", async () => { - class AbortError extends Error { - override name = "AbortError"; - } - fetchSpy.mockRejectedValue(new AbortError("aborted")); + it("does not record success on fetch rejection", async () => { + fetchSpy.mockRejectedValue(new Error("network error")); vi.spyOn(globalThis, "setTimeout").mockImplementation(((cb: () => void) => { cb(); return 0 as unknown as NodeJS.Timeout; @@ -154,11 
+104,7 @@ describe("Agent365Exporter network statsbeat", () => { const exporter = new Agent365Exporter({ tokenResolver: () => "tok" }); await exportSpan(exporter); - const host = fetchHost(); - const exceptions = drain(REQUEST_EXCEPTION_NAME); - expect([...exceptions.entries()]).toEqual([[[ENDPOINT, host, "AbortError"], 4]]); - const durations = drain(REQUEST_DURATION_NAME); - expect([...durations.keys()][0]).toEqual([ENDPOINT, host]); + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); }); it("records nothing when MICROSOFT_OTEL_SDKSTATS_DISABLED=true", async () => { @@ -172,6 +118,5 @@ describe("Agent365Exporter network statsbeat", () => { await exportSpan(exporter); expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); - expect(drain(REQUEST_DURATION_NAME).size).toBe(0); }); }); diff --git a/test/internal/unit/sdkstats/metrics.test.ts b/test/internal/unit/sdkstats/metrics.test.ts index 02dc149..6396c98 100644 --- a/test/internal/unit/sdkstats/metrics.test.ts +++ b/test/internal/unit/sdkstats/metrics.test.ts @@ -5,19 +5,9 @@ import { describe, it, beforeEach, expect } from "vitest"; import { MeterProvider } from "@opentelemetry/sdk-metrics"; import { - REQUEST_DURATION_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_FAILURE_NAME, - REQUEST_RETRY_NAME, REQUEST_SUCCESS_NAME, - REQUEST_THROTTLE_NAME, _resetAllForTest as _resetNetworkStatsForTest, - recordDuration, - recordException, - recordFailure, - recordRetry, recordSuccess, - recordThrottle, } from "../../../../src/sdkstats/networkStats.js"; import { FEATURE_TYPE_FEATURE, @@ -203,16 +193,10 @@ describe("sdkstats/metrics", () => { }); describe("network gauges (default mode)", () => { - it("emits one observation per drained key, attaches endpoint + host + statusCode/exceptionType, and clears after collection", async () => { + it("emits one observation per drained key, attaches endpoint + host, and clears after collection", async () => { _resetNetworkStatsForTest(); recordSuccess("a365", "a365.example.com"); 
recordSuccess("a365", "a365.example.com"); - recordFailure("a365", "a365.example.com", 503); - recordRetry("a365", "a365.example.com", 503); - recordRetry("a365", "a365.example.com", 503); - recordThrottle("otlp", "otlp.example.com", 402); - recordException("otlp", "otlp.example.com", "AbortError"); - recordDuration("a365", "a365.example.com", 1.25); const { PeriodicExportingMetricReader } = await import("@opentelemetry/sdk-metrics"); const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); @@ -239,34 +223,8 @@ describe("sdkstats/metrics", () => { expect(success[0].attributes.host).toBe("a365.example.com"); expect(success[0].attributes.statusCode).toBeUndefined(); - const failure = byName(REQUEST_FAILURE_NAME); - expect(failure).toHaveLength(1); - expect(failure[0].value).toBe(1); - expect(failure[0].attributes.endpoint).toBe("a365"); - expect(failure[0].attributes.host).toBe("a365.example.com"); - expect(failure[0].attributes.statusCode).toBe("503"); - - const retry = byName(REQUEST_RETRY_NAME); - expect(retry).toHaveLength(1); - expect(retry[0].value).toBe(2); - expect(retry[0].attributes.statusCode).toBe("503"); - - const throttle = byName(REQUEST_THROTTLE_NAME); - expect(throttle).toHaveLength(1); - expect(throttle[0].attributes.endpoint).toBe("otlp"); - expect(throttle[0].attributes.host).toBe("otlp.example.com"); - expect(throttle[0].attributes.statusCode).toBe("402"); - - const exception = byName(REQUEST_EXCEPTION_NAME); - expect(exception).toHaveLength(1); - expect(exception[0].attributes.exceptionType).toBe("AbortError"); - - const duration = byName(REQUEST_DURATION_NAME); - expect(duration).toHaveLength(1); - expect(duration[0].value).toBeCloseTo(1.25); - // Common dimensions per spec. 
- for (const dp of [...success, ...failure, ...retry, ...throttle, ...exception, ...duration]) { + for (const dp of success) { expect(dp.attributes.rp).toBe("unknown"); expect(dp.attributes.attach).toBe("Manual"); expect(dp.attributes.cikey).toBeUndefined(); diff --git a/test/internal/unit/sdkstats/networkStats.test.ts b/test/internal/unit/sdkstats/networkStats.test.ts index bb45a57..1b45632 100644 --- a/test/internal/unit/sdkstats/networkStats.test.ts +++ b/test/internal/unit/sdkstats/networkStats.test.ts @@ -5,21 +5,10 @@ import { beforeEach, describe, expect, it } from "vitest"; import { NETWORK_METRIC_NAMES, - REQUEST_DURATION_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_FAILURE_NAME, - REQUEST_RETRY_NAME, REQUEST_SUCCESS_NAME, - REQUEST_THROTTLE_NAME, - THROTTLE_STATUS_CODES, _resetAllForTest, drain, - recordDuration, - recordException, - recordFailure, - recordRetry, recordSuccess, - recordThrottle, } from "../../../../src/sdkstats/networkStats.js"; describe("sdkstats/networkStats", () => { @@ -27,18 +16,9 @@ describe("sdkstats/networkStats", () => { _resetAllForTest(); }); - it("exposes 6 metric names matching the Python distro", () => { - expect(NETWORK_METRIC_NAMES).toEqual([ - REQUEST_SUCCESS_NAME, - REQUEST_FAILURE_NAME, - REQUEST_RETRY_NAME, - REQUEST_THROTTLE_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_DURATION_NAME, - ]); + it("exposes the Request_Success_Count metric name", () => { + expect(NETWORK_METRIC_NAMES).toEqual([REQUEST_SUCCESS_NAME]); expect(REQUEST_SUCCESS_NAME).toBe("Request_Success_Count"); - expect(REQUEST_DURATION_NAME).toBe("Request_Duration"); - expect(THROTTLE_STATUS_CODES.has(402)).toBe(true); }); it("accumulates success counts per (endpoint, host) and reports keys as two-element tuples", () => { @@ -55,46 +35,6 @@ describe("sdkstats/networkStats", () => { expect(entries[1][1]).toBe(1); }); - it("keys failure/retry/throttle/exception by [endpoint, host, second-attr]", () => { - recordFailure("otlp", "a.example.com", 503); - 
recordFailure("otlp", "a.example.com", 503); - recordFailure("otlp", "a.example.com", 502); - recordRetry("otlp", "a.example.com", 429); - recordThrottle("otlp", "a.example.com"); - recordException("otlp", "a.example.com", "AbortError"); - recordException("otlp", "a.example.com", "AbortError"); - - const failures = drain(REQUEST_FAILURE_NAME); - const flat = [...failures.entries()].map(([k, v]) => [k.join("|"), v] as const); - expect(flat).toEqual( - expect.arrayContaining([ - ["otlp|a.example.com|503", 2], - ["otlp|a.example.com|502", 1], - ]), - ); - - const retries = drain(REQUEST_RETRY_NAME); - expect([...retries.values()]).toEqual([1]); - const [retryKey] = [...retries.keys()]; - expect(retryKey).toEqual(["otlp", "a.example.com", "429"]); - - const throttles = drain(REQUEST_THROTTLE_NAME); - expect([...throttles.keys()][0]).toEqual(["otlp", "a.example.com", "402"]); - - const exceptions = drain(REQUEST_EXCEPTION_NAME); - expect([...exceptions.entries()]).toEqual([[["otlp", "a.example.com", "AbortError"], 2]]); - }); - - it("accumulates duration as a sum of seconds", () => { - recordDuration("otlp", "a.example.com", 0.25); - recordDuration("otlp", "a.example.com", 1.0); - recordDuration("otlp", "b.example.com", 2.5); - const snap = drain(REQUEST_DURATION_NAME); - const flat = Object.fromEntries([...snap.entries()].map(([k, v]) => [k[1], v])); - expect(flat["a.example.com"]).toBeCloseTo(1.25); - expect(flat["b.example.com"]).toBeCloseTo(2.5); - }); - it("drain() empties the bucket atomically — second drain returns an empty map", () => { recordSuccess("otlp", "a.example.com"); expect(drain(REQUEST_SUCCESS_NAME).size).toBe(1); @@ -103,8 +43,6 @@ describe("sdkstats/networkStats", () => { it("_resetAllForTest() clears every bucket", () => { recordSuccess("otlp", "a.example.com"); - recordFailure("otlp", "a.example.com", 500); - recordDuration("otlp", "a.example.com", 1.0); _resetAllForTest(); for (const name of NETWORK_METRIC_NAMES) { 
expect(drain(name).size).toBe(0); diff --git a/test/internal/unit/sdkstats/otlpWrapper.test.ts b/test/internal/unit/sdkstats/otlpWrapper.test.ts index 34c6709..3d5dfa1 100644 --- a/test/internal/unit/sdkstats/otlpWrapper.test.ts +++ b/test/internal/unit/sdkstats/otlpWrapper.test.ts @@ -14,9 +14,6 @@ import { NetworkStatsSpanExporter, } from "../../../../src/sdkstats/otlpWrapper.js"; import { - REQUEST_DURATION_NAME, - REQUEST_EXCEPTION_NAME, - REQUEST_FAILURE_NAME, REQUEST_SUCCESS_NAME, _resetAllForTest, drain, @@ -39,20 +36,11 @@ function clearEndpointEnv(): void { delete process.env.OTEL_EXPORTER_OTLP_LOGS_ENDPOINT; } -function makeFakeSpanExporter( - result: ExportResult | "throw" | Error, -): SpanExporter & { exported: number } { +function makeFakeSpanExporter(result: ExportResult): SpanExporter & { exported: number } { return { exported: 0, export(_spans: ReadableSpan[], cb: (r: ExportResult) => void): void { this.exported++; - if (result === "throw") { - throw new TypeError("boom"); - } - if (result instanceof Error) { - cb({ code: ExportResultCode.FAILED, error: result }); - return; - } cb(result); }, shutdown(): Promise { @@ -76,7 +64,7 @@ describe("sdkstats/otlpWrapper", () => { }); describe("NetworkStatsSpanExporter", () => { - it("records success + duration on SUCCESS", async () => { + it("records success on SUCCESS", async () => { const inner = makeFakeSpanExporter({ code: ExportResultCode.SUCCESS }); const wrapper = new NetworkStatsSpanExporter(inner); @@ -90,28 +78,14 @@ describe("sdkstats/otlpWrapper", () => { const success = drain(REQUEST_SUCCESS_NAME); expect([...success.entries()]).toEqual([[[ENDPOINT, HOST], 1]]); - const dur = drain(REQUEST_DURATION_NAME); - expect([...dur.keys()][0]).toEqual([ENDPOINT, HOST]); }); - it("records failure(0) + duration on FAILED result (no HTTP status code surfaced)", async () => { + it("does not record success on FAILED result", async () => { const inner = makeFakeSpanExporter({ code: ExportResultCode.FAILED }); 
const wrapper = new NetworkStatsSpanExporter(inner); await new Promise((resolve) => wrapper.export([], () => resolve())); - const failure = drain(REQUEST_FAILURE_NAME); - expect([...failure.entries()]).toEqual([[[ENDPOINT, HOST, "0"], 1]]); - }); - - it("records exception + duration and re-throws on a synchronous throw", async () => { - const inner = makeFakeSpanExporter("throw"); - const wrapper = new NetworkStatsSpanExporter(inner); - - expect(() => wrapper.export([], () => {})).toThrow(TypeError); - const exc = drain(REQUEST_EXCEPTION_NAME); - expect([...exc.entries()]).toEqual([[[ENDPOINT, HOST, "TypeError"], 1]]); - const dur = drain(REQUEST_DURATION_NAME); - expect(dur.size).toBe(1); + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); }); it("forwards forceFlush and shutdown", async () => { @@ -144,7 +118,7 @@ describe("sdkstats/otlpWrapper", () => { }; } - it("records success + duration", async () => { + it("records success on SUCCESS", async () => { const wrapper = new NetworkStatsMetricExporter( makeMetricExporter({ code: ExportResultCode.SUCCESS }), ); @@ -186,7 +160,7 @@ describe("sdkstats/otlpWrapper", () => { }; } - it("records success + duration on SUCCESS", async () => { + it("records success on SUCCESS", async () => { const wrapper = new NetworkStatsLogExporter( makeLogExporter({ code: ExportResultCode.SUCCESS }), ); @@ -194,12 +168,12 @@ describe("sdkstats/otlpWrapper", () => { expect([...drain(REQUEST_SUCCESS_NAME).entries()]).toEqual([[[ENDPOINT, HOST], 1]]); }); - it("records failure(0) on FAILED result", async () => { + it("does not record success on FAILED result", async () => { const wrapper = new NetworkStatsLogExporter( makeLogExporter({ code: ExportResultCode.FAILED }), ); await new Promise((resolve) => wrapper.export([], () => resolve())); - expect([...drain(REQUEST_FAILURE_NAME).entries()]).toEqual([[[ENDPOINT, HOST, "0"], 1]]); + expect(drain(REQUEST_SUCCESS_NAME).size).toBe(0); }); }); From 1b85bce75e58f959708283685760cc522242d1e6 Mon 
Sep 17 00:00:00 2001 From: Jackson Weber Date: Wed, 20 May 2026 12:51:36 -0700 Subject: [PATCH 06/14] fix(sdkstats): restore dual-interval pipeline (24h long + 15min short) Use two separate MeterProviders so Feature/Feature.instrumentations gauges export at the original 24-hour long interval while network statsbeat (Request_Success_Count) exports at the 15-minute short interval. Restores APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL env var override. SdkStatsMetrics now accepts separate long/short MeterProviders with a backward-compatible single-provider overload for tests. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/manager.ts | 105 +++++++++++++++----- src/sdkstats/metrics.ts | 66 ++++++++++-- test/internal/unit/sdkstats/manager.test.ts | 44 ++++++-- 3 files changed, 170 insertions(+), 45 deletions(-) diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index ff33f4e..a21abc5 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -30,23 +30,32 @@ import { isSdkStatsEnabled, setSdkStatsShutdown } from "./state.js"; import { SdkStatsMetrics } from "./metrics.js"; /** - * Default short export interval (15 minutes) for the standalone SDKStats - * pipeline. This matches the Application Insights statsbeat - * short-interval cadence used by the network statsbeat counters and the - * Python distro (`_get_stats_short_export_interval()` in - * `azure.monitor.opentelemetry.exporter.statsbeat._utils`). + * Default long export interval (24 hours) for Feature/Instrumentation + * SDKStats per the Application Insights SDKStats specification. * - * The pipeline emits both Feature/Feature.instrumentations gauges - * (when not in `networkOnly` mode) and the `Request_Success_Count` - * network gauge; the network counter dominates cadence requirements, - * so the single shared interval defaults to short rather than long. 
+ * @internal + */ +const DEFAULT_LONG_EXPORT_INTERVAL_MS = 24 * 60 * 60 * 1000; + +/** + * Override env var: long export interval in seconds, per the spec. + * + * @internal + */ +const SDKSTATS_LONG_EXPORT_INTERVAL_ENV = "APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL"; + +/** + * Default short export interval (15 minutes) for network statsbeat + * counters. Matches the Application Insights statsbeat short-interval + * cadence used by the Python distro (`_get_stats_short_export_interval()` + * in `azure.monitor.opentelemetry.exporter.statsbeat._utils`). * * @internal */ const DEFAULT_SHORT_EXPORT_INTERVAL_MS = 15 * 60 * 1000; /** - * Override env var: standalone SDKStats export interval in seconds. + * Override env var: short (network) export interval in seconds. * Matches the Python distro env var name. * * @internal @@ -65,7 +74,7 @@ const SDKSTATS_SHORT_EXPORT_INTERVAL_ENV = "APPLICATIONINSIGHTS_STATS_SHORT_EXPO const SDKSTATS_CONNECTION_STRING_ENV = "APPLICATIONINSIGHTS_STATS_CONNECTION_STRING"; /** - * Initial-export delay (15 seconds) before the first flush. + * Initial-export delay (15 seconds) before the first long-interval flush. * * The spec recommends this delay specifically for the Node.js SDK to * avoid short-running CLI-style applications generating excess SDKStats @@ -84,7 +93,8 @@ const INITIAL_EXPORT_DELAY_MS = 15 * 1000; export class SdkStatsManager { private static _instance: SdkStatsManager | undefined; - private _meterProvider: MeterProvider | undefined; + private _longMeterProvider: MeterProvider | undefined; + private _shortMeterProvider: MeterProvider | undefined; private _metrics: SdkStatsMetrics | undefined; private _initialized = false; private _initialExportTimer: NodeJS.Timeout | undefined; @@ -103,6 +113,11 @@ export class SdkStatsManager { /** * Set up SDKStats export via the Azure Monitor statsbeat endpoint. 
* + * Two separate MeterProviders are created so that Feature / + * Feature.instrumentations gauges export on the long interval (24h) + * while network statsbeat gauges (`Request_Success_Count`) export on + * the short interval (15 min). + * * @param options.networkOnly When `true`, the {@link SdkStatsMetrics} * instance only registers the network gauge(s) and skips the * feature/instrumentation gauges. Used on the Azure-Monitor-enabled @@ -161,21 +176,42 @@ export class SdkStatsManager { const connectionString = process.env[SDKSTATS_CONNECTION_STRING_ENV] ?? NON_EU_CONNECTION_STRING; - const exporter = new AzureMonitorStatsbeatExporter({ + const emptyResource = resourceFromAttributes({}); + + // Long-interval pipeline (24h) — Feature / Feature.instrumentations. + // Skipped when `networkOnly` is true (AzMon exporter owns those). + if (!options.networkOnly) { + const longExporter = new AzureMonitorStatsbeatExporter({ + connectionString, + disableOfflineStorage: true, + }); + const longReader = new PeriodicExportingMetricReader({ + exporter: longExporter, + exportIntervalMillis: resolveLongExportInterval(), + }); + this._longMeterProvider = new MeterProvider({ + readers: [longReader], + resource: emptyResource, + }); + } + + // Short-interval pipeline (15 min) — network statsbeat gauges. 
+ const shortExporter = new AzureMonitorStatsbeatExporter({ connectionString, disableOfflineStorage: true, }); - - const reader = new PeriodicExportingMetricReader({ - exporter, - exportIntervalMillis: resolveExportInterval(), + const shortReader = new PeriodicExportingMetricReader({ + exporter: shortExporter, + exportIntervalMillis: resolveShortExportInterval(), }); - - this._meterProvider = new MeterProvider({ - readers: [reader], - resource: resourceFromAttributes({}), + this._shortMeterProvider = new MeterProvider({ + readers: [shortReader], + resource: emptyResource, }); - this._metrics = new SdkStatsMetrics(this._meterProvider, { + + this._metrics = new SdkStatsMetrics({ + longMeterProvider: this._longMeterProvider, + shortMeterProvider: this._shortMeterProvider, networkOnly: options.networkOnly, cikey: options.cikey, }); @@ -188,7 +224,10 @@ export class SdkStatsManager { // excess startup traffic. `unref()` so the timer never blocks // process shutdown. this._initialExportTimer = setTimeout(() => { - this._meterProvider?.forceFlush().catch((err) => { + Promise.all([ + this._longMeterProvider?.forceFlush(), + this._shortMeterProvider?.forceFlush(), + ]).catch((err) => { Logger.getInstance().debug("[SDKStats] Initial forceFlush failed.", err); }); }, INITIAL_EXPORT_DELAY_MS); @@ -215,7 +254,10 @@ export class SdkStatsManager { this._initialExportTimer = undefined; } try { - await this._meterProvider?.shutdown(); + await Promise.all([ + this._longMeterProvider?.shutdown(), + this._shortMeterProvider?.shutdown(), + ]); } catch (error) { Logger.getInstance().debug("[SDKStats] Error shutting down standalone pipeline.", error); } finally { @@ -226,7 +268,8 @@ export class SdkStatsManager { } private _cleanup(): void { - this._meterProvider = undefined; + this._longMeterProvider = undefined; + this._shortMeterProvider = undefined; this._metrics = undefined; this._initialized = false; if (this._initialExportTimer) { @@ -244,7 +287,17 @@ export class SdkStatsManager 
{ } } -function resolveExportInterval(): number { +function resolveLongExportInterval(): number { + const raw = process.env[SDKSTATS_LONG_EXPORT_INTERVAL_ENV]; + if (!raw) return DEFAULT_LONG_EXPORT_INTERVAL_MS; + const seconds = Number(raw); + if (!Number.isFinite(seconds) || seconds <= 0) { + return DEFAULT_LONG_EXPORT_INTERVAL_MS; + } + return Math.floor(seconds * 1000); +} + +function resolveShortExportInterval(): number { const raw = process.env[SDKSTATS_SHORT_EXPORT_INTERVAL_ENV]; if (!raw) return DEFAULT_SHORT_EXPORT_INTERVAL_MS; const seconds = Number(raw); diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 853f1ee..817cf47 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -88,6 +88,19 @@ export interface SdkStatsMetricsOptions { * partition the time series. */ networkOnly?: boolean; + /** + * MeterProvider for long-interval gauges (Feature / + * Feature.instrumentations). May be undefined when `networkOnly` is + * true. When provided, gauges are registered on a meter from this + * provider so they export at the long (24h) cadence. + */ + longMeterProvider?: MeterProvider; + /** + * MeterProvider for short-interval gauges (network statsbeat like + * `Request_Success_Count`). Gauges are registered on a meter from + * this provider so they export at the short (15 min) cadence. + */ + shortMeterProvider: MeterProvider; } /** @@ -98,9 +111,37 @@ export interface SdkStatsMetricsOptions { export class SdkStatsMetrics { private readonly commonAttributes: Record; - constructor(meterProvider: MeterProvider, options: SdkStatsMetricsOptions = {}) { - const { distroVersion, networkOnly = false, cikey } = options; - const meter = meterProvider.getMeter("microsoft.opentelemetry.sdkstats"); + constructor(options: SdkStatsMetricsOptions); + /** @deprecated Use the options-object overload instead. 
*/ + constructor( + meterProvider: MeterProvider, + options?: Omit, + ); + constructor( + providerOrOptions: MeterProvider | SdkStatsMetricsOptions, + legacyOptions?: Omit, + ) { + let longMeterProvider: MeterProvider | undefined; + let shortMeterProvider: MeterProvider; + let distroVersion: string | undefined; + let networkOnly: boolean; + let cikey: string | undefined; + + if ("shortMeterProvider" in providerOrOptions) { + // New options-object overload + longMeterProvider = providerOrOptions.longMeterProvider; + shortMeterProvider = providerOrOptions.shortMeterProvider; + distroVersion = providerOrOptions.distroVersion; + networkOnly = providerOrOptions.networkOnly ?? false; + cikey = providerOrOptions.cikey; + } else { + // Legacy single-provider overload (used by tests) + longMeterProvider = providerOrOptions; + shortMeterProvider = providerOrOptions; + distroVersion = legacyOptions?.distroVersion; + networkOnly = legacyOptions?.networkOnly ?? false; + cikey = legacyOptions?.cikey; + } // Per spec/sdkstats.md the required customDimensions on every // SDKStats observation are: rp, attach, runtimeVersion, os, @@ -123,23 +164,28 @@ export class SdkStatsMetrics { // alongside the Azure Monitor exporter's own statsbeat — that pipeline // already emits them (with our distro bits bridged in via // `_bridge_sdkstats_to_azure_monitor`) and would collide with these. - if (!networkOnly) { - const featureGauge = meter.createObservableGauge(FEATURE_METRIC_NAME, { + // These gauges are registered on the long-interval MeterProvider. 
+ if (!networkOnly && longMeterProvider) { + const longMeter = longMeterProvider.getMeter("microsoft.opentelemetry.sdkstats"); + + const featureGauge = longMeter.createObservableGauge(FEATURE_METRIC_NAME, { description: "SDKStats metric tracking enabled features", }); featureGauge.addCallback(this.observeFeatures); - const instrumentationGauge = meter.createObservableGauge(INSTRUMENTATION_METRIC_NAME, { + const instrumentationGauge = longMeter.createObservableGauge(INSTRUMENTATION_METRIC_NAME, { description: "SDKStats metric tracking enabled instrumentations", }); instrumentationGauge.addCallback(this.observeInstrumentations); } - // Network statsbeat gauges — always registered. Each callback drains - // the counts accumulated by exporters between observations and emits - // one Observation per (endpoint[, second-attr]) tuple. + // Network statsbeat gauges — always registered on the short-interval + // MeterProvider. Each callback drains the counts accumulated by + // exporters between observations and emits one Observation per + // (endpoint, host) tuple. 
+ const shortMeter = shortMeterProvider.getMeter("microsoft.opentelemetry.sdkstats.network"); for (const spec of NETWORK_GAUGE_SPECS) { - const gauge = meter.createObservableGauge(spec.metric, { + const gauge = shortMeter.createObservableGauge(spec.metric, { unit: spec.unit, description: spec.description, }); diff --git a/test/internal/unit/sdkstats/manager.test.ts b/test/internal/unit/sdkstats/manager.test.ts index e8ee103..9909adf 100644 --- a/test/internal/unit/sdkstats/manager.test.ts +++ b/test/internal/unit/sdkstats/manager.test.ts @@ -9,6 +9,13 @@ import { _resetSdkStatsStateForTest, } from "../../../../src/sdkstats/state.js"; +function getReaderInterval(provider: unknown): number | undefined { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const p = provider as any; + const reader = p?._sharedState?.metricCollectors?.[0]?._metricReader; + return reader?._exportInterval; +} + describe("sdkstats/manager", () => { beforeEach(() => { _resetSdkStatsStateForTest(); @@ -55,15 +62,36 @@ describe("sdkstats/manager", () => { expect(await manager.shutdown()).toBe(false); }); - it("uses the spec-compliant 15-minute short-export interval by default", async () => { + it("uses a 24-hour long-interval for Feature gauges and 15-minute short-interval for network gauges", async () => { const manager = SdkStatsManager.getInstance(); await manager.initialize(); - // Reach into the private MeterProvider's reader to confirm interval. 
// eslint-disable-next-line @typescript-eslint/no-explicit-any - const provider = (manager as any)._meterProvider; - const reader = provider?._sharedState?.metricCollectors?.[0]?._metricReader; - const intervalMs = reader?._exportInterval; - expect(intervalMs).toBe(15 * 60 * 1000); + const longProvider = (manager as any)._longMeterProvider; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const shortProvider = (manager as any)._shortMeterProvider; + expect(getReaderInterval(longProvider)).toBe(24 * 60 * 60 * 1000); + expect(getReaderInterval(shortProvider)).toBe(15 * 60 * 1000); + }); + + it("skips the long-interval provider when networkOnly is true", async () => { + const manager = SdkStatsManager.getInstance(); + await manager.initialize({ networkOnly: true }); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + expect((manager as any)._longMeterProvider).toBeUndefined(); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + expect((manager as any)._shortMeterProvider).toBeDefined(); + }); + + it("honours APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL override (seconds)", async () => { + process.env["APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL"] = "3600"; + try { + const manager = SdkStatsManager.getInstance(); + await manager.initialize(); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + expect(getReaderInterval((manager as any)._longMeterProvider)).toBe(3_600_000); + } finally { + delete process.env["APPLICATIONINSIGHTS_STATS_LONG_EXPORT_INTERVAL"]; + } }); it("honours APPLICATIONINSIGHTS_STATS_SHORT_EXPORT_INTERVAL override (seconds)", async () => { @@ -72,9 +100,7 @@ describe("sdkstats/manager", () => { const manager = SdkStatsManager.getInstance(); await manager.initialize(); // eslint-disable-next-line @typescript-eslint/no-explicit-any - const provider = (manager as any)._meterProvider; - const reader = provider?._sharedState?.metricCollectors?.[0]?._metricReader; - 
expect(reader?._exportInterval).toBe(60_000); + expect(getReaderInterval((manager as any)._shortMeterProvider)).toBe(60_000); } finally { delete process.env["APPLICATIONINSIGHTS_STATS_SHORT_EXPORT_INTERVAL"]; } From c9e664bdeb293ab4b2ed3f6c392d6e29393c069b Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Wed, 20 May 2026 12:55:22 -0700 Subject: [PATCH 07/14] docs(sdkstats): clean up non-standard @param JSDoc in manager Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/manager.ts | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index a21abc5..1d6bdfd 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -118,19 +118,6 @@ export class SdkStatsManager { * while network statsbeat gauges (`Request_Success_Count`) export on * the short interval (15 min). * - * @param options.networkOnly When `true`, the {@link SdkStatsMetrics} - * instance only registers the network gauge(s) and skips the - * feature/instrumentation gauges. Used on the Azure-Monitor-enabled - * path because the AzMon exporter's own long-interval statsbeat - * already emits those gauges (with our distro bits bridged in via - * `AZURE_MONITOR_STATSBEAT_FEATURES`). - * @param options.cikey Customer iKey to report as the `cikey` - * customDimension on every observation. Required by the SDKStats - * spec. Omit (or pass undefined) when the customer is not exporting - * to an Application Insights resource — the dimension is then - * dropped from the observation entirely rather than emitted as an - * empty string. - * * Returns `true` if the standalone pipeline was initialized (or was * already initialized), `false` if SDKStats are disabled via env var * or initialization failed. 
From 5214ca1062902e1a125d93b5d2fcad0e2ccffdb1 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Wed, 20 May 2026 12:58:42 -0700 Subject: [PATCH 08/14] chore(sdkstats): remove istanbul pragma, sanity check, and redundant comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/manager.ts | 5 ----- src/sdkstats/metrics.ts | 8 -------- 2 files changed, 13 deletions(-) diff --git a/src/sdkstats/manager.ts b/src/sdkstats/manager.ts index 1d6bdfd..c57216b 100644 --- a/src/sdkstats/manager.ts +++ b/src/sdkstats/manager.ts @@ -113,11 +113,6 @@ export class SdkStatsManager { /** * Set up SDKStats export via the Azure Monitor statsbeat endpoint. * - * Two separate MeterProviders are created so that Feature / - * Feature.instrumentations gauges export on the long interval (24h) - * while network statsbeat gauges (`Request_Success_Count`) export on - * the short interval (15 min). - * * Returns `true` if the standalone pipeline was initialized (or was * already initialized), `false` if SDKStats are disabled via env var * or initialization failed. diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 817cf47..5187f3c 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -19,7 +19,6 @@ import type { ObservableResult } from "@opentelemetry/api"; import { MICROSOFT_OPENTELEMETRY_VERSION } from "../types.js"; import { getSdkStatsFeatureFlags, getSdkStatsInstrumentationFlags } from "./state.js"; import { - NETWORK_METRIC_NAMES, REQUEST_SUCCESS_NAME, drain, type NetworkMetricName, @@ -55,13 +54,6 @@ const NETWORK_GAUGE_SPECS: readonly NetworkGaugeSpec[] = [ }, ]; -// Sanity check at module load — keeps NETWORK_GAUGE_SPECS in sync with -// NETWORK_METRIC_NAMES if either is edited. -/* istanbul ignore next */ -if (NETWORK_GAUGE_SPECS.length !== NETWORK_METRIC_NAMES.length) { - throw new Error("NETWORK_GAUGE_SPECS is out of sync with NETWORK_METRIC_NAMES"); -} - /** * Options for {@link SdkStatsMetrics}. 
*/ From 848ae596288af5da53ad6992dc49e65761a498db Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Wed, 20 May 2026 15:51:28 -0700 Subject: [PATCH 09/14] style(sdkstats): fix prettier formatting in metrics.ts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/metrics.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 5187f3c..47daf4a 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -18,11 +18,7 @@ import type { ObservableResult } from "@opentelemetry/api"; import { MICROSOFT_OPENTELEMETRY_VERSION } from "../types.js"; import { getSdkStatsFeatureFlags, getSdkStatsInstrumentationFlags } from "./state.js"; -import { - REQUEST_SUCCESS_NAME, - drain, - type NetworkMetricName, -} from "./networkStats.js"; +import { REQUEST_SUCCESS_NAME, drain, type NetworkMetricName } from "./networkStats.js"; /** * Feature SDKStats `type` dimension values, per the Application Insights From c348045516a987b8c08e7475b3c038be9d4f2bfe Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Wed, 20 May 2026 16:00:07 -0700 Subject: [PATCH 10/14] fix(sdkstats): preserve hyphens in shortHost for non-Azure hostnames The regex character class [^/.-] excluded hyphens, causing hostnames like 'my-otlp-collector.example.com' to be truncated to 'my'. Changed to [^/.] so hyphens are preserved. Azure stamp suffixes (e.g. -1) are now stripped separately with a targeted /-\d+$/ replace. Added shortHost tests covering Azure URLs, localhost, plain hostnames, hyphenated hostnames, empty input, and non-URL strings. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/networkStats.ts | 5 +++- .../unit/sdkstats/networkStats.test.ts | 28 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/sdkstats/networkStats.ts b/src/sdkstats/networkStats.ts index 94a6528..e86d6a2 100644 --- a/src/sdkstats/networkStats.ts +++ b/src/sdkstats/networkStats.ts @@ -81,6 +81,7 @@ export function recordSuccess(endpoint: string, host: string): void { * `https://westus2-1.in.applicationinsights.azure.com` → `westus2` * `http://localhost:4318/v1/traces` → `localhost` * `https://collector.example.com:8080` → `collector` + * `https://my-otlp-collector.example.com` → `my-otlp-collector` * For non-URL inputs, returns the hostname or the raw input on failure. * * @internal @@ -89,7 +90,7 @@ export function shortHost(input: string): string { if (!input) return "unknown"; let host = input; try { - const hostRegex = /^https?:\/\/(?:www\.)?([^/.-]+)/; + const hostRegex = /^https?:\/\/(?:www\.)?([^/.]+)/; const res = hostRegex.exec(input); if (res && res.length > 1) { host = res[1]; @@ -101,6 +102,8 @@ export function shortHost(input: string): string { } } host = host.replace(".in.applicationinsights.azure.com", ""); + // Strip Azure stamp suffix (e.g. 
westus2-1 → westus2) + host = host.replace(/-\d+$/, ""); const colon = host.indexOf(":"); if (colon > 0) host = host.slice(0, colon); } catch { diff --git a/test/internal/unit/sdkstats/networkStats.test.ts b/test/internal/unit/sdkstats/networkStats.test.ts index 1b45632..7a74a53 100644 --- a/test/internal/unit/sdkstats/networkStats.test.ts +++ b/test/internal/unit/sdkstats/networkStats.test.ts @@ -9,6 +9,7 @@ import { _resetAllForTest, drain, recordSuccess, + shortHost, } from "../../../../src/sdkstats/networkStats.js"; describe("sdkstats/networkStats", () => { @@ -48,4 +49,31 @@ describe("sdkstats/networkStats", () => { expect(drain(name).size).toBe(0); } }); + + describe("shortHost", () => { + it("extracts the Azure region from an AzMon ingestion URL", () => { + expect(shortHost("https://westus2-1.in.applicationinsights.azure.com")).toBe("westus2"); + }); + + it("strips the port from localhost URLs", () => { + expect(shortHost("http://localhost:4318/v1/traces")).toBe("localhost"); + }); + + it("extracts the first label from a plain hostname URL", () => { + expect(shortHost("https://collector.example.com:8080")).toBe("collector"); + }); + + it("preserves hyphens in non-Azure hostnames", () => { + expect(shortHost("https://my-otlp-collector.example.com")).toBe("my-otlp-collector"); + expect(shortHost("https://otel-collector-prod.example.com:4318")).toBe("otel-collector-prod"); + }); + + it("returns 'unknown' for empty input", () => { + expect(shortHost("")).toBe("unknown"); + }); + + it("returns the raw input for non-URL strings", () => { + expect(shortHost("not-a-url")).toBe("not-a-url"); + }); + }); }); From 5326567d70639b84021f6409b815b20cd4a81c25 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Wed, 20 May 2026 16:02:55 -0700 Subject: [PATCH 11/14] chore: remove NETWORK_SDKSTATS_PLAN.md and its gitignore entry Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.gitignore 
b/.gitignore index 8e15469..d0b0f76 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,3 @@ npm-debug.log* # Temp tmp/ - -# Internal planning doc — keep out of the repo -NETWORK_SDKSTATS_PLAN.md From ae4320aee403031ea7a8686e7e43087f63409537 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Fri, 22 May 2026 14:06:02 -0700 Subject: [PATCH 12/14] Emit cikey as 'N/A' when instrumentation key is unavailable Avoids backend KQL queries needing to filter out empty rows for OTLP-only, A365-only, or Console-only deployments. Spec update: aep-health-and-standards/Telemetry-Collection-Spec#930. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/metrics.ts | 15 +++++++-------- test/internal/unit/sdkstats/metrics.test.ts | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 47daf4a..0ef9c1e 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -59,9 +59,8 @@ export interface SdkStatsMetricsOptions { /** * Customer instrumentation key emitted as the `cikey` customDimension * on every SDKStats observation, per the Application Insights SDKStats - * spec. Omitted entirely when undefined or empty (e.g. for OTLP-only + * spec. Reported as `"N/A"` when undefined or empty (e.g. for OTLP-only * customers without an Application Insights connection string). - */ cikey?: string; /** * When `true`, skip the Feature / Feature.instrumentations gauges. Used @@ -133,11 +132,11 @@ export class SdkStatsMetrics { // Per spec/sdkstats.md the required customDimensions on every // SDKStats observation are: rp, attach, runtimeVersion, os, - // language, version (plus endpoint/host on network gauges and - // statusCode/exceptionType where applicable). `cikey` is only - // meaningful when the customer is exporting to an Application - // Insights resource; omit it entirely for OTLP-only / Console-only - // customers rather than emitting an empty string. 
+ // language, version, cikey (plus endpoint/host on network gauges and + // statusCode/exceptionType where applicable). `cikey` is reported as + // "N/A" when the customer is not exporting to an Application Insights + // resource (e.g. OTLP-only / Console-only), so backend KQL queries + // don't have to filter out missing rows. this.commonAttributes = { rp: "unknown", attach: "Manual", @@ -145,7 +144,7 @@ export class SdkStatsMetrics { os: os.type(), language: STATSBEAT_LANGUAGE, version: distroVersion || MICROSOFT_OPENTELEMETRY_VERSION, - ...(cikey ? { cikey } : {}), + cikey: cikey || "N/A", }; // Feature / instrumentation bitmask gauges are skipped when running diff --git a/test/internal/unit/sdkstats/metrics.test.ts b/test/internal/unit/sdkstats/metrics.test.ts index 6396c98..04b0278 100644 --- a/test/internal/unit/sdkstats/metrics.test.ts +++ b/test/internal/unit/sdkstats/metrics.test.ts @@ -227,7 +227,7 @@ describe("sdkstats/metrics", () => { for (const dp of success) { expect(dp.attributes.rp).toBe("unknown"); expect(dp.attributes.attach).toBe("Manual"); - expect(dp.attributes.cikey).toBeUndefined(); + expect(dp.attributes.cikey).toBe("N/A"); expect(dp.attributes.language).toBe("node"); } From b5735260ed80f4f345c32c20e399c8fc3f95b0af Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Fri, 22 May 2026 14:10:29 -0700 Subject: [PATCH 13/14] Fix unterminated JSDoc on cikey option Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/metrics.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 0ef9c1e..9e6fddc 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -61,6 +61,7 @@ export interface SdkStatsMetricsOptions { * on every SDKStats observation, per the Application Insights SDKStats * spec. Reported as `"N/A"` when undefined or empty (e.g. for OTLP-only * customers without an Application Insights connection string). 
+ */ cikey?: string; /** * When `true`, skip the Feature / Feature.instrumentations gauges. Used From 86ddbaae7f32b079f7387e92d441568fd740e2b1 Mon Sep 17 00:00:00 2001 From: Jackson Weber Date: Fri, 22 May 2026 14:16:41 -0700 Subject: [PATCH 14/14] Trim cikey comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/sdkstats/metrics.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/sdkstats/metrics.ts b/src/sdkstats/metrics.ts index 9e6fddc..83aacc0 100644 --- a/src/sdkstats/metrics.ts +++ b/src/sdkstats/metrics.ts @@ -134,10 +134,8 @@ export class SdkStatsMetrics { // Per spec/sdkstats.md the required customDimensions on every // SDKStats observation are: rp, attach, runtimeVersion, os, // language, version, cikey (plus endpoint/host on network gauges and - // statusCode/exceptionType where applicable). `cikey` is reported as - // "N/A" when the customer is not exporting to an Application Insights - // resource (e.g. OTLP-only / Console-only), so backend KQL queries - // don't have to filter out missing rows. + // statusCode/exceptionType where applicable). `cikey` falls back to + // "N/A" when unset. this.commonAttributes = { rp: "unknown", attach: "Manual",