diff --git a/CHANGELOG.md b/CHANGELOG.md index 36347767d..d5f4df954 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Release History +## Unreleased + +### Bug Fixes + +- `AutoCollectExceptions` no longer calls `forceFlush` per exception and now rate-limits exception telemetry to 50 records/min (emitting a single suppressed-count summary per window). `forceFlush` still runs on the terminal exit path. + ### 3.15.0 (2026-05-13) #### Breaking Changes diff --git a/src/logs/exceptions.ts b/src/logs/exceptions.ts index d98c8045d..1246fbe5a 100644 --- a/src/logs/exceptions.ts +++ b/src/logs/exceptions.ts @@ -11,10 +11,20 @@ const UNHANDLED_REJECTION_HANDLER_NAME: ExceptionHandle = "unhandledRejection"; const FALLBACK_ERROR_MESSAGE = "A promise was rejected without providing an error. Application Insights generated this error stack for you."; +// Token-bucket rate limit for exception telemetry to prevent a misbehaving +// workload from becoming a request-rate amplifier against the ingestion +// endpoint. When the bucket is exhausted, exceptions are dropped and a single +// summary record is emitted describing how many were suppressed. +const EXCEPTION_RATE_LIMIT_CAPACITY = 50; // max exceptions tracked per window +const EXCEPTION_RATE_LIMIT_WINDOW_MS = 60_000; // 1 minute refill window + export class AutoCollectExceptions { private _exceptionListenerHandle?: (error: Error | undefined) => void; private _rejectionListenerHandle?: (error: Error | undefined) => void; private _client: LogApi; + private _tokens: number = EXCEPTION_RATE_LIMIT_CAPACITY; + private _lastRefillTime: number = Date.now(); + private _suppressedCount: number = 0; constructor(client: LogApi) { this._client = client; @@ -59,25 +69,77 @@ export class AutoCollectExceptions { delete this._rejectionListenerHandle; } + /** + * Refill the token bucket if at least one full window has elapsed since + * the last refill. If the bucket was previously exhausted, emit a single + * summary describing how many exceptions were suppressed. + */ + private _refillTokens(): void { + const now = Date.now(); + if (now - this._lastRefillTime >= EXCEPTION_RATE_LIMIT_WINDOW_MS) { + this._lastRefillTime = now; + this._tokens = EXCEPTION_RATE_LIMIT_CAPACITY; + if (this._suppressedCount > 0 && this._client) { + const suppressed = this._suppressedCount; + this._suppressedCount = 0; + try { + this._client.trackException({ + exception: new Error( + `Application Insights suppressed ${suppressed} exception telemetry record(s) in the last ${EXCEPTION_RATE_LIMIT_WINDOW_MS / 1000}s due to client-side rate limiting.` + ), + }); + } catch { + // best-effort; do not let summary emission throw from the handler + } + } + } + } + private _handleException( reThrow: boolean, name: ExceptionHandle, error: Error | undefined = new Error(FALLBACK_ERROR_MESSAGE) ) { if (this._client) { - this._client.trackException({ exception: error }); - try { - (logs.getLoggerProvider() as LoggerProvider).forceFlush().then(() => { - // only rethrow when we are the only listener - if (reThrow && name && process.listeners(name as any).length === 1) { + this._refillTokens(); + const isTerminal = + reThrow && !!name && process.listeners(name as any).length === 1; + + // Always record the terminal/fatal exception, bypassing the rate + // limit — by definition it can only occur once per process. + if (isTerminal || this._tokens > 0) { + if (!isTerminal) { + this._tokens--; + } + this._client.trackException({ exception: error }); + } else { + this._suppressedCount++; + } + + // Only force a flush on the terminal path where the process is + // about to exit. Under normal operation, let the + // BatchLogRecordProcessor batch exception records to avoid + // amplifying request rate against the ingestion endpoint. + if (isTerminal) { + try { + (logs.getLoggerProvider() as LoggerProvider).forceFlush().then(() => { // eslint-disable-next-line no-console console.error(error); // eslint-disable-next-line no-process-exit process.exit(1); - } - }); - } catch (error) { - console.error(`Could not get the loggerProvider upon handling a tracked exception: ${error}`); + }, () => { + // eslint-disable-next-line no-console + console.error(error); + // eslint-disable-next-line no-process-exit + process.exit(1); + }); + } catch (flushError) { + console.error(`Could not get the loggerProvider upon handling a tracked exception: ${flushError}`); + // eslint-disable-next-line no-console + console.error(error); + // eslint-disable-next-line no-process-exit + process.exit(1); + } } } else { // eslint-disable-next-line no-console diff --git a/test/unitTests/logs/exceptions.tests.ts b/test/unitTests/logs/exceptions.tests.ts index 92c1b2fb0..be71a40ff 100644 --- a/test/unitTests/logs/exceptions.tests.ts +++ b/test/unitTests/logs/exceptions.tests.ts @@ -2,6 +2,8 @@ // Licensed under the MIT license. import assert from "assert"; import sinon from "sinon"; +import { logs } from "@opentelemetry/api-logs"; +import { LoggerProvider } from "@opentelemetry/sdk-logs"; import { AutoCollectExceptions } from "../../../src/logs/exceptions"; @@ -40,4 +42,78 @@ describe("AutoCollection/Exceptions", () => { assert.equal(processRemoveListenerSpy.getCall(0).args[0], "uncaughtException"); assert.equal(processRemoveListenerSpy.getCall(1).args[0], "unhandledRejection"); }); + + it("does not forceFlush on non-terminal (unhandledRejection) exceptions", () => { + const trackException = sandbox.stub(); + const forceFlush = sandbox.stub().resolves(); + sandbox + .stub(logs, "getLoggerProvider") + .returns({ forceFlush } as unknown as LoggerProvider); + + const fakeClient: any = { trackException }; + const exceptions = new AutoCollectExceptions(fakeClient); + try { + // Invoke the rejection handler (reThrow=false → never terminal) + (exceptions as any)._handleException( + false, + "unhandledRejection", + new Error("boom") + ); + assert.equal(trackException.callCount, 1, "should track the exception"); + assert.equal( + forceFlush.callCount, + 0, + "must NOT forceFlush per exception under normal operation" + ); + } finally { + exceptions.shutdown(); + } + }); + + it("rate-limits exception telemetry and emits a single suppressed-count summary", () => { + const trackException = sandbox.stub(); + sandbox + .stub(logs, "getLoggerProvider") + .returns({ forceFlush: sandbox.stub().resolves() } as unknown as LoggerProvider); + + const fakeClient: any = { trackException }; + const exceptions = new AutoCollectExceptions(fakeClient); + try { + // Fire many more than the bucket capacity (50) within the window. + for (let i = 0; i < 500; i++) { + (exceptions as any)._handleException( + false, + "unhandledRejection", + new Error(`err-${i}`) + ); + } + assert.equal( + trackException.callCount, + 50, + "should cap tracked exceptions at the bucket capacity within one window" + ); + + // Advance the window by manipulating internal timestamp, then fire + // one more — refill should emit one summary record + the new one. + (exceptions as any)._lastRefillTime = Date.now() - 61_000; + (exceptions as any)._handleException( + false, + "unhandledRejection", + new Error("after-window") + ); + assert.equal( + trackException.callCount, + 52, + "after window refill, expect one summary record + the new tracked exception" + ); + const summaryArg = trackException.getCall(50).args[0]; + assert.match( + summaryArg.exception.message, + /suppressed 450 exception telemetry record/, + "summary record should report the number of suppressed exceptions" + ); + } finally { + exceptions.shutdown(); + } + }); });