diff --git a/src/directLine.mock.ts b/src/directLine.mock.ts index 152fba1a..fff353bd 100644 --- a/src/directLine.mock.ts +++ b/src/directLine.mock.ts @@ -29,6 +29,7 @@ export interface Conversation { export interface Server { scheduler: TestScheduler; conversation: Conversation; + webSocketUrl?: string; } const tokenPrefix = 'token'; @@ -211,6 +212,7 @@ type EventHandler = (this: WebSocket, ev: E) => any; export const mockWebSocket = (server: Server): WebSocketConstructor => class MockWebSocket implements WebSocket, ActivitySocket { constructor(url: string, protocols?: string | string[]) { + server.webSocketUrl = url; server.scheduler.schedule(() => { this.readyState = WebSocket.CONNECTING; @@ -285,3 +287,36 @@ export const mockServices = (server: Server, scheduler: TestScheduler): DirectLi ajax: mockAjax(server), random: () => 0, }); + +// Helper to inject agent.capabilities event with audio support +export const mockAgentCapabilitiesEvent = (): DirectLineExport.Activity => ({ + type: 'event', + from: { id: 'bot' }, + name: 'agent.capabilities', + value: { + modalities: { + text: {}, + audio: { + fonts: [], + tools: [], + instructions: [] + } + } + } +}); + +// Helper to inject agent.capabilities event into WebSocket +export const injectAgentCapabilities = (server: Server): void => { + const capabilitiesEvent = mockAgentCapabilitiesEvent(); + const activityGroup: DirectLineExport.ActivityGroup = { + activities: [capabilitiesEvent], + watermark: server.conversation.history.length.toString(), + }; + const message = new MessageEvent('type', { data: JSON.stringify(activityGroup) }); + server.conversation.sockets.forEach(s => s.onmessage(message)); +}; + +// Helper to check if WebSocket URL contains multimodal path +export const hasMultimodalUrl = (server: Server): boolean => { + return !!server.webSocketUrl?.includes('/stream/multimodal'); +}; diff --git a/src/directLine.test.ts b/src/directLine.test.ts index 5f6b7e39..89a58994 100644 --- a/src/directLine.test.ts +++ b/src/directLine.test.ts @@ -390,4 +390,396 @@ describe('MockSuite', () => { expect(postResult).toStrictEqual('retry'); }); }); + + describe('VoiceMode', () => { + + describe('enableVoiceMode: true (explicit)', () => { + + test('voice mode enabled and uses /stream/multimodal URL', () => { + directline = new DirectLineExport.DirectLine({ ...services, enableVoiceMode: true }); + + // Verify voice mode is enabled synchronously + expect(directline.getIsVoiceModeEnabled()).toBe(true); + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + subscriptions.push(directline.activity$.subscribe()); + + scheduler.flush(); + + // Verify WebSocket URL contains /stream/multimodal + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(true); + }); + + test('postActivity sends via WebSocket (does not echo back)', () => { + directline = new DirectLineExport.DirectLine({ ...services, enableVoiceMode: true }); + + const textActivity = DirectLineMock.mockActivity('hello-voice-mode'); + + let postCompleted = false; + const actual: Array = []; + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + yield directline.postActivity(textActivity).do(() => postCompleted = true); + yield Observable.timer(100, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + subscriptions.push(directline.activity$.subscribe(a => actual.push(a))); + + scheduler.flush(); + + expect(postCompleted).toBe(true); + // WebSocket path: activity does NOT echo back (server doesn't broadcast WS-sent activities) + expect(actual).not.toContainEqual(textActivity); + }); + + test('reconnect after WebSocket close still uses /stream/multimodal URL', () => { + directline = new DirectLineExport.DirectLine({ ...services, enableVoiceMode: true }); + + // First verify initial connection uses multimodal URL + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + subscriptions.push(directline.activity$.subscribe()); + + scheduler.flush(); + + // Verify initial connection uses multimodal + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(true); + + // Simulate WebSocket close (triggers reconnect) + DirectLineMock.injectClose(server); + + // Continue scheduler to allow reconnect + const reconnectScenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + }; + + subscriptions.push(lazyConcat(reconnectScenario()).observeOn(scheduler).subscribe()); + + scheduler.flush(); + + // After reconnect, should still use /stream/multimodal URL + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(true); + expect(directline.getIsVoiceModeEnabled()).toBe(true); + }); + }); + + describe('enableVoiceMode: false (explicit)', () => { + + test('voice mode disabled and uses standard /stream URL', () => { + directline = new DirectLineExport.DirectLine({ ...services, enableVoiceMode: false }); + + // Verify voice mode is disabled + expect(directline.getIsVoiceModeEnabled()).toBe(false); + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + subscriptions.push(directline.activity$.subscribe()); + + scheduler.flush(); + + // Verify WebSocket URL does NOT contain /stream/multimodal + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(false); + }); + + test('postActivity sends via HTTP (echoes back)', () => { + directline = new DirectLineExport.DirectLine({ ...services, enableVoiceMode: false }); + + const textActivity = DirectLineMock.mockActivity('hello-http'); + + const actual: Array = []; + subscriptions.push(directline.activity$.subscribe(a => actual.push(a))); + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + yield directline.postActivity(textActivity); + yield Observable.timer(100, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + + scheduler.flush(); + + // HTTP path: activity echoes back via activity$ (server broadcasts HTTP-posted activities) + expect(actual).toContainEqual(textActivity); + }); + + test('403 post returns retry and still uses standard /stream URL', () => { + services.ajax = DirectLineMock.mockAjax(server, (urlOrRequest) => { + if (typeof urlOrRequest === 'string') { + throw new Error(); + } + + if (urlOrRequest.url && urlOrRequest.url.indexOf('/conversations') > 0 && !/activities/u.test(urlOrRequest.url)) { + const response: Partial = { + response: server.conversation, + status: 201, + xhr: { getResponseHeader: () => 'n/a' } as unknown as XMLHttpRequest + }; + return response as AjaxResponse; + } + + if (urlOrRequest.url && /activities/u.test(urlOrRequest.url)) { + const response: Partial = { + status: 403, + xhr: { getResponseHeader: () => 'n/a' } as unknown as XMLHttpRequest + }; + const error = new Error('Forbidden'); + throw Object.assign(error, response); + } + + throw new Error(); + }); + + directline = new DirectLineExport.DirectLine({ ...services, enableVoiceMode: false }); + + const retryActivity = DirectLineMock.mockActivity('will-retry-false'); + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + yield directline.postActivity(retryActivity); + }; + + let postResult: string | undefined; + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe({ + next: v => { postResult = v as string; }, + error: () => {}, + complete: () => {} + })); + + scheduler.flush(); + + expect(postResult).toStrictEqual('retry'); + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(false); + }); + }); + + describe('enableVoiceMode: undefined (auto-detect)', () => { + + test('non-iframe: voice mode disabled and uses standard /stream URL', () => { + // Default test environment is not an iframe (window.self === window.top) + directline = new DirectLineExport.DirectLine({ ...services }); + + // Verify voice mode is disabled (synchronous - no iframe check needed) + expect(directline.getIsVoiceModeEnabled()).toBe(false); + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + subscriptions.push(directline.activity$.subscribe()); + + scheduler.flush(); + + // Verify standard /stream URL (not multimodal) + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(false); + }); + + test('non-iframe: 403 post returns retry and still uses standard /stream URL', () => { + services.ajax = DirectLineMock.mockAjax(server, (urlOrRequest) => { + if (typeof urlOrRequest === 'string') { + throw new Error(); + } + + if (urlOrRequest.url && urlOrRequest.url.indexOf('/conversations') > 0 && !/activities/u.test(urlOrRequest.url)) { + const response: Partial = { + response: server.conversation, + status: 201, + xhr: { getResponseHeader: () => 'n/a' } as unknown as XMLHttpRequest + }; + return response as AjaxResponse; + } + + if (urlOrRequest.url && /activities/u.test(urlOrRequest.url)) { + const response: Partial = { + status: 403, + xhr: { getResponseHeader: () => 'n/a' } as unknown as XMLHttpRequest + }; + const error = new Error('Forbidden'); + throw Object.assign(error, response); + } + + throw new Error(); + }); + + directline = new DirectLineExport.DirectLine({ ...services }); + + const retryActivity = DirectLineMock.mockActivity('will-retry-undefined'); + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + yield directline.postActivity(retryActivity); + }; + + let postResult: string | undefined; + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe({ + next: v => { postResult = v as string; }, + error: () => {}, + complete: () => {} + })); + + scheduler.flush(); + + expect(postResult).toStrictEqual('retry'); + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(false); + }); + + test('iframe WITH microphone permission: voice mode enabled and uses /stream/multimodal URL', async () => { + // Mock iframe detection: window.self !== window.top + const originalSelf = window.self; + Object.defineProperty(window, 'self', { + value: { notTop: true }, + writable: true, + configurable: true + }); + + // Mock permissionsPolicy.allowsFeature('microphone') to return true + const originalPermissionsPolicy = (document as any).permissionsPolicy; + (document as any).permissionsPolicy = { + allowsFeature: (feature: string) => feature === 'microphone' + }; + + try { + directline = new DirectLineExport.DirectLine({ ...services }); + await Promise.resolve(); + + const textActivity = DirectLineMock.mockActivity('iframe-with-mic'); + let postCompleted = false; + const actual: Array = []; + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + yield directline.postActivity(textActivity).do(() => postCompleted = true); + yield Observable.timer(100, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + subscriptions.push(directline.activity$.subscribe(a => actual.push(a))); + + scheduler.flush(); + + expect(directline.getIsVoiceModeEnabled()).toBe(true); + + // Verify /stream/multimodal URL + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(true); + // Verify WebSocket routing: activity does NOT echo back + expect(postCompleted).toBe(true); + expect(actual).not.toContainEqual(textActivity); + } finally { + Object.defineProperty(window, 'self', { + value: originalSelf, + writable: true, + configurable: true + }); + if (originalPermissionsPolicy) { + (document as any).permissionsPolicy = originalPermissionsPolicy; + } else { + delete (document as any).permissionsPolicy; + } + } + }); + + test('iframe WITHOUT microphone permission: voice mode disabled', async () => { + // Mock iframe detection: window.self !== window.top + const originalSelf = window.self; + Object.defineProperty(window, 'self', { + value: { notTop: true }, + writable: true, + configurable: true + }); + + // Mock permissionsPolicy.allowsFeature('microphone') to return false + const originalPermissionsPolicy = (document as any).permissionsPolicy; + (document as any).permissionsPolicy = { + allowsFeature: (feature: string) => false + }; + + try { + directline = new DirectLineExport.DirectLine({ ...services }); + + expect(directline.getIsVoiceModeEnabled()).toBe(false); + + const textActivity = DirectLineMock.mockActivity('iframe-no-mic'); + const actual: Array = []; + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + yield directline.postActivity(textActivity); + yield Observable.timer(100, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + subscriptions.push(directline.activity$.subscribe(a => actual.push(a))); + + scheduler.flush(); + + // Verify standard /stream URL (not multimodal) + expect(DirectLineMock.hasMultimodalUrl(server)).toBe(false); + // Verify HTTP routing: activity echoes back + expect(actual).toContainEqual(textActivity); + } finally { + Object.defineProperty(window, 'self', { + value: originalSelf, + writable: true, + configurable: true + }); + if (originalPermissionsPolicy) { + (document as any).permissionsPolicy = originalPermissionsPolicy; + } else { + delete (document as any).permissionsPolicy; + } + } + }); + }); + + describe('Voice Configuration & Events', () => { + + test('getVoiceConfiguration returns undefined initially', () => { + directline = new DirectLineExport.DirectLine({ ...services }); + + expect(directline.getVoiceConfiguration()).toBeUndefined(); + }); + + test('agent.capabilities event sets voiceConfiguration and fires capabilitieschanged', () => { + directline = new DirectLineExport.DirectLine({ ...services }); + + let eventFired = false; + directline.addEventListener('capabilitieschanged', () => { + eventFired = true; + }); + + subscriptions.push(directline.activity$.subscribe()); + + const scenario = function* (): IterableIterator> { + yield Observable.timer(200, scheduler); + }; + + subscriptions.push(lazyConcat(scenario()).observeOn(scheduler).subscribe()); + + scheduler.flush(); + + // Inject agent.capabilities event + DirectLineMock.injectAgentCapabilities(server); + + // Verify voiceConfiguration is set + const config = directline.getVoiceConfiguration(); + expect(config).toBeDefined(); + expect(config?.sampleRate).toBe(24000); + expect(config?.chunkIntervalMs).toBe(100); + + // Verify capabilitieschanged event fired + expect(eventFired).toBe(true); + }); + }); + }); }); diff --git a/src/directLine.ts b/src/directLine.ts index 443e0416..d81fdc50 100644 --- a/src/directLine.ts +++ b/src/directLine.ts @@ -381,7 +381,14 @@ export interface DirectLineOptions { * If true, every outgoing activity will include deliveryMode: 'stream'. * If false/omitted, deliveryMode is not sent (defaults to 'normal' in ABS). */ - streaming?: boolean + streaming?: boolean, + /** + * Enable voice mode for audio streaming. + * - If true: voice mode enabled, uses /stream/multimodal endpoint, all traffic sent via WebSocket + * - If false: voice mode disabled, uses existing flow as is (/stream endpoint with http post) + * - If undefined: auto-detect for iframes with allow="microphone" attribute + */ + enableVoiceMode?: boolean } export interface Services { @@ -451,6 +458,52 @@ const konsole = { } } +/** + * Checks if the current context is running inside an iframe. + */ +const isInIframe = (): boolean => { + try { + return typeof window !== 'undefined' && window.self !== window.top; + } catch (e) { + // If accessing window.top throws (cross-origin), we're definitely in an iframe + return true; + } +} + +/** + * Checks if the iframe has microphone permission via the allow attribute. + */ +const hasIframeMicrophonePermission = async (): Promise => { + if (typeof window === 'undefined' || typeof document === 'undefined') { + return false; + } + + try { + // Try using the Permissions Policy API (Chrome 88+, Edge 88+) + const doc = document as any; + if (doc.permissionsPolicy && typeof doc.permissionsPolicy.allowsFeature === 'function') { + return doc.permissionsPolicy.allowsFeature('microphone'); + } + + // Fallback to deprecated Feature Policy API (Chrome 60-87, Edge 79-87) + if (doc.featurePolicy && typeof doc.featurePolicy.allowsFeature === 'function') { + return doc.featurePolicy.allowsFeature('microphone'); + } + + // Fallback to Permissions API (broader support: Chrome 43+, Firefox 46+, Safari 16+) + if (typeof navigator !== 'undefined' && navigator.permissions) { + const result = await navigator.permissions.query({ name: 'microphone' as PermissionName }); + // 'granted' or 'prompt' means microphone is allowed by iframe policy + // 'denied' means either user denied or iframe policy blocks it + return result.state !== 'denied'; + } + } catch (e) { + // If permissions check fails, assume microphone is not allowed in iframe + } + + return false; +} + export interface IBotConnection { connectionStatus$: BehaviorSubject, activity$: Observable, @@ -479,6 +532,7 @@ export class DirectLine implements IBotConnection { public referenceGrammarId: string; private timeout = 20 * 1000; private retries: number; + private webSocketConnection: WebSocket | null = null; private localeOnStartConversation: string; private userIdOnStartConversation: string; @@ -488,6 +542,19 @@ export class DirectLine implements IBotConnection { private tokenRefreshSubscription: Subscription; private streaming: boolean; + // Voice mode: when true, use multimodal stream endpoint and send all traffic via WebSocket + private voiceModeEnabled: boolean = false; + + // Voice configuration default constants + private static readonly VOICE_SAMPLE_RATE = 24000; + private static readonly VOICE_CHUNK_INTERVAL_MS = 100; + + // Voice configuration: set when server supports audio modality, undefined otherwise + private voiceConfiguration: { sampleRate: number; chunkIntervalMs: number } | undefined; + + // EventTarget for dispatching capability change events + private eventTarget = new EventTarget(); + constructor(options: DirectLineOptions & Partial) { this.secret = options.secret; this.token = options.secret || options.token; @@ -497,6 +564,9 @@ export class DirectLine implements IBotConnection { this.streaming = options.streaming; } + // Initialize voice mode detection (sets voiceModeEnabled synchronously for non-iframe cases) + this.initializeVoiceMode(options.enableVoiceMode); + if (options.conversationStartProperties && options.conversationStartProperties.locale) { if (Object.prototype.toString.call(options.conversationStartProperties.locale) === '[object String]') { this.localeOnStartConversation = options.conversationStartProperties.locale; @@ -785,6 +855,29 @@ export class DirectLine implements IBotConnection { if (activity.type === "message" && activity.attachments && activity.attachments.length > 0) return this.postMessageWithAttachments(activity); + // When voice mode is enabled, send ALL traffic (text + voice) via WebSocket + if (this.voiceModeEnabled) { + if (!this.webSocket) { + return Observable.throw(new Error('Voice mode requires WebSocket to be enabled'), this.services.scheduler); + } + return this.checkConnection(true) + .flatMap(_ => + Observable.create((subscriber: Subscriber) => { + try { + if (!this.webSocketConnection || this.webSocketConnection.readyState !== WebSocket.OPEN) { + throw new Error('WebSocket connection not ready for voice activities'); + } + this.webSocketConnection.send(JSON.stringify(activity)); + subscriber.next(activity); + subscriber.complete(); + } catch (e) { + subscriber.error(e); + } + }) + ) + .catch(error => this.catchExpiredToken(error)); + } + // If we're not connected to the bot, get connected // Will throw an error if we are not connected konsole.log("postActivity", activity); @@ -957,12 +1050,15 @@ export class DirectLine implements IBotConnection { // implementation, I decided roll the below, where the logic is more purposeful. - @billba private observableWebSocket() { return Observable.create((subscriber: Subscriber) => { - konsole.log("creating WebSocket", this.streamUrl); - const ws = new this.services.WebSocket(this.streamUrl); + // Apply multimodal stream URL if voice mode is enabled + const streamUrl = this.getMultimodalStreamUrl(this.streamUrl); + + konsole.log("creating WebSocket", streamUrl); + this.webSocketConnection = new this.services.WebSocket(streamUrl); let sub: Subscription; let closed: boolean; - ws.onopen = open => { + this.webSocketConnection.onopen = open => { konsole.log("WebSocket open", open); // Chrome is pretty bad at noticing when a WebSocket connection is broken. // If we periodically ping the server with empty messages, it helps Chrome @@ -970,14 +1066,14 @@ export class DirectLine implements IBotConnection { // error, and that give us the opportunity to attempt to reconnect. sub = Observable.interval(this.timeout, this.services.scheduler).subscribe(_ => { try { - ws.send("") + this.webSocketConnection.send("") } catch(e) { konsole.log("Ping error", e); } }); } - ws.onclose = close => { + this.webSocketConnection.onclose = close => { konsole.log("WebSocket close", close); if (sub) sub.unsubscribe(); @@ -987,7 +1083,7 @@ export class DirectLine implements IBotConnection { closed = true; } - ws.onerror = error => { + this.webSocketConnection.onerror = error => { konsole.log("WebSocket error", error); if (sub) sub.unsubscribe(); @@ -997,14 +1093,20 @@ export class DirectLine implements IBotConnection { closed = true; } - ws.onmessage = message => message.data && subscriber.next(JSON.parse(message.data)); + this.webSocketConnection.onmessage = message => { + if (message.data) { + const data = JSON.parse(message.data); + this.handleIncomingActivity(data); + subscriber.next(data); + } + }; // This is the 'unsubscribe' method, which is called when this observable is disposed. // When the WebSocket closes itself, we throw an error, and this function is eventually called. // When the observable is closed first (e.g. when tearing down a WebChat instance) then // we need to manually close the WebSocket. return () => { - if (ws.readyState === 0 || ws.readyState === 1) ws.close(); + if (this.webSocketConnection.readyState === 0 || this.webSocketConnection.readyState === 1) this.webSocketConnection.close(); } }) as Observable } @@ -1079,6 +1181,38 @@ export class DirectLine implements IBotConnection { this.userIdOnStartConversation = userId; } + /** + * Returns voice configuration from server's agent.capabilities event, or undefined if server doesn't support audio. + * Use this to configure microphone settings. Only available after server confirms audio support. + */ + getVoiceConfiguration() { + return this.voiceConfiguration; + } + + /** + * Returns true if multimodal experience is requested (client-side), false otherwise. + * Does NOT guarantee server supports voice - use getVoiceConfiguration() for that. + * Use this to determine if activities are sent via WebSocket (no echo-back wait needed). + */ + getIsVoiceModeEnabled(): boolean { + return !!this.voiceModeEnabled; + } + + /** + * Adds an event listener for adapter events (e.g., 'capabilitieschanged'). + * Used by consumer to subscribe to capability updates. + */ + addEventListener(type: string, listener: EventListenerOrEventListenerObject, options?: boolean | AddEventListenerOptions): void { + this.eventTarget.addEventListener(type, listener, options); + } + + /** + * Removes an event listener for adapter events. + */ + removeEventListener(type: string, listener: EventListenerOrEventListenerObject, options?: boolean | EventListenerOptions): void { + this.eventTarget.removeEventListener(type, listener, options); + } + private parseToken(token: string) { try { const { user } = jwtDecode(token) as { [key: string]: any; }; @@ -1090,4 +1224,77 @@ export class DirectLine implements IBotConnection { } } + /** + * Initialize voice mode. + * - Explicit true/false: set synchronously (no race condition) + * - Undefined: auto-detect for iframes with microphone permission (async, best effort) + */ + private initializeVoiceMode(enableVoiceMode?: boolean): void { + // Explicit true: enable synchronously + if (enableVoiceMode === true) { + this.voiceModeEnabled = true; + this.eventTarget.dispatchEvent(new Event('capabilitieschanged')); + return; + } + + // Explicit false: already false by default, nothing to do + if (enableVoiceMode === false) { + return; + } + + // Undefined: auto-detect for iframe with microphone permission (async) + if (isInIframe()) { + hasIframeMicrophonePermission().then(hasMic => { + if (hasMic) { + this.voiceModeEnabled = true; + this.eventTarget.dispatchEvent(new Event('capabilitieschanged')); + } + }); + } + } + + /** + * Handles incoming activity group to check for agent.capabilities event. + * Sets voice configuration if server supports audio modality. + */ + private handleIncomingActivity(data: any): void { + const activities = data?.activities; + if (!Array.isArray(activities)) { + return; + } + + for (const activity of activities) { + if (activity?.type === 'event' && activity?.name === 'agent.capabilities') { + const modalities = activity?.value?.modalities; + const hasAudio = modalities?.audio && + typeof modalities.audio === 'object' && + Object.keys(modalities.audio).length > 0; + + if (hasAudio) { + this.voiceConfiguration = { + sampleRate: DirectLine.VOICE_SAMPLE_RATE, + chunkIntervalMs: DirectLine.VOICE_CHUNK_INTERVAL_MS + }; + this.eventTarget.dispatchEvent(new Event('capabilitieschanged')); + } + } + } + } + + /** + * Modifies stream URL for voice mode: replaces /stream with /stream/multimodal + */ + private getMultimodalStreamUrl(url: string): string { + if (!this.voiceModeEnabled || !url) { + return url; + } + + // Replace /stream endpoint with /stream/multimodal (if not already multimodal) + if (!url.includes('/stream/multimodal')) { + return url.replace('/stream', '/stream/multimodal'); + } + + return url; + } + }