Make spatial audio Firefox-only

Hopefully explained in comment: we have a heisenbug where we sometimes
lack audio from a certain participant, so this simplifies the audio
path by removing the workaround required to do AEC with spatial audio
on chrome.
This commit is contained in:
David Baker 2022-12-16 17:12:17 +00:00
parent f9845617b3
commit 223793a445
11 changed files with 94 additions and 208 deletions

View file

@ -151,6 +151,15 @@
margin-right: 10px;
}
.checkboxField.disabled,
.checkboxField.disabled .description {
color: var(--quinary-content);
}
.checkboxField.disabled .checkbox {
border-color: var(--quinary-content);
}
.checkbox svg {
display: none;
}

View file

@ -50,19 +50,19 @@ import { Avatar } from "../Avatar";
import { UserMenuContainer } from "../UserMenuContainer";
import { useRageshakeRequestModal } from "../settings/submit-rageshake";
import { RageshakeRequestModal } from "./RageshakeRequestModal";
import { useMediaHandler } from "../settings/useMediaHandler";
import { useShowInspector, useSpatialAudio } from "../settings/useSetting";
import { useModalTriggerState } from "../Modal";
import { useAudioContext } from "../video-grid/useMediaStream";
import { useFullscreen } from "../video-grid/useFullscreen";
import { AudioContainer } from "../video-grid/AudioContainer";
import { useAudioOutputDevice } from "../video-grid/useAudioOutputDevice";
import { PosthogAnalytics } from "../PosthogAnalytics";
import { widget, ElementWidgetActions } from "../widget";
import { useJoinRule } from "./useJoinRule";
import { useUrlParams } from "../UrlParams";
import { usePrefersReducedMotion } from "../usePrefersReducedMotion";
import { ConnectionState, ParticipantInfo } from "./useGroupCall";
import { TileDescriptor } from "../video-grid/TileDescriptor";
import { ParticipantInfo } from "./useGroupCall";
import { AudioSink } from "../video-grid/AudioSink";
import { useMediaHandler } from "../settings/useMediaHandler";
const canScreenshare = "getDisplayMedia" in (navigator.mediaDevices ?? {});
// There is currently a bug in Safari or our code with cloning and sending MediaStreams
@ -91,18 +91,6 @@ interface Props {
hideHeader: boolean;
}
// Represents something that should get a tile on the layout,
// ie. a user's video feed or a screen share feed.
export interface TileDescriptor {
id: string;
member: RoomMember;
focused: boolean;
presenter: boolean;
callFeed?: CallFeed;
isLocal?: boolean;
connectionState: ConnectionState;
}
export function InCallView({
client,
groupCall,
@ -145,15 +133,12 @@ export function InCallView({
const [spatialAudio] = useSpatialAudio();
const [audioContext, audioDestination, audioRef] = useAudioContext();
const { audioOutput } = useMediaHandler();
const [audioContext, audioDestination] = useAudioContext();
const [showInspector] = useShowInspector();
const { modalState: feedbackModalState, modalProps: feedbackModalProps } =
useModalTriggerState();
useAudioOutputDevice(audioRef, audioOutput);
const { hideScreensharing } = useUrlParams();
useEffect(() => {
@ -347,16 +332,30 @@ export function InCallView({
[styles.maximised]: maximisedParticipant,
});
// If spatial audio is disabled, we render one audio tag for each participant
// (with spatial audio, all the audio goes via the Web Audio API)
// We also do this if there's a feed maximised because we only trigger spatial
// audio rendering for feeds that we're displaying, which will need to be fixed
// once we start having more participants than we can fit on a screen, but this
// is a workaround for now.
const { audioOutput } = useMediaHandler();
const audioElements: JSX.Element[] = [];
if (!spatialAudio || maximisedParticipant) {
for (const item of items) {
if (item.isLocal) continue; // We don't want to render own audio
audioElements.push(
<AudioSink
tileDescriptor={item}
audioOutput={audioOutput}
key={item.id}
/>
);
}
}
return (
<div className={containerClasses} ref={containerRef}>
<audio ref={audioRef} />
{(!spatialAudio || maximisedParticipant) && (
<AudioContainer
items={items}
audioContext={audioContext}
audioDestination={audioDestination}
/>
)}
<>{audioElements}</>
{!hideHeader && !maximisedParticipant && (
<Header>
<LeftNav>

View file

@ -32,6 +32,7 @@ import {
useSpatialAudio,
useShowInspector,
useOptInAnalytics,
canEnableSpatialAudio,
} from "./useSetting";
import { FieldRow, InputField } from "../input/Input";
import { Button } from "../button";
@ -115,9 +116,14 @@ export const SettingsModal = (props: Props) => {
label={t("Spatial audio")}
type="checkbox"
checked={spatialAudio}
description={t(
"This will make a speaker's audio seem as if it is coming from where their tile is positioned on screen. (Experimental feature: this may impact the stability of audio.)"
)}
disabled={!canEnableSpatialAudio()}
description={
canEnableSpatialAudio()
? t(
"This will make a speaker's audio seem as if it is coming from where their tile is positioned on screen. (Experimental feature: this may impact the stability of audio.)"
)
: t("This feature is only supported on Firefox.")
}
onChange={(event: React.ChangeEvent<HTMLInputElement>) =>
setSpatialAudio(event.target.checked)
}

View file

@ -58,7 +58,26 @@ export const getSetting = <T>(name: string, defaultValue: T): T => {
return item === null ? defaultValue : JSON.parse(item);
};
export const useSpatialAudio = () => useSetting("spatial-audio", false);
// Spatial audio routes participant audio through an AudioContext. On Chrome
// this bypasses the AEC processor and therefore breaks echo cancellation, so
// we restrict the feature to Firefox, which we know passes audio-context
// output through its AEC algorithm.
// The Chrome bug for this is
// https://bugs.chromium.org/p/chromium/issues/detail?id=687574: once it is
// fixed and the updated version is deployed widely enough, we can allow
// spatial audio everywhere. It's currently behind a chrome flag, so we could
// enable this in Electron if we enabled the chrome flag in the Electron
// wrapper.
export const canEnableSpatialAudio = () => {
  return navigator.userAgent.includes("Firefox");
};
export const useSpatialAudio = (): [boolean, (val: boolean) => void] => {
const settingVal = useSetting("spatial-audio", false);
if (canEnableSpatialAudio()) return settingVal;
return [false, (_: boolean) => {}];
};
export const useShowInspector = () => useSetting("show-inspector", false);
export const useOptInAnalytics = () => useSetting("opt-in-analytics", false);
export const useKeyboardShortcuts = () =>

View file

@ -1,96 +0,0 @@
/*
Copyright 2022 New Vector Ltd
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import React, { FC, useEffect, useRef } from "react";
import { TileDescriptor } from "../room/InCallView";
import { useCallFeed } from "./useCallFeed";
import { useMediaStreamTrackCount } from "./useMediaStream";
// XXX: These in fact do not render anything but to my knowledge this is the
// only way to run a hook on each element of an array
// Props for the hidden per-participant audio renderer.
interface AudioForParticipantProps {
  item: TileDescriptor;
  audioContext: AudioContext;
  audioDestination: AudioNode;
}

// Pipes one remote participant's audio through the shared Web Audio graph:
// stream -> MediaStreamAudioSourceNode -> GainNode -> audioDestination.
// Renders nothing; it exists purely so hooks can run per participant.
export const AudioForParticipant: FC<AudioForParticipantProps> = ({
  item,
  audioContext,
  audioDestination,
}) => {
  const { stream, localVolume } = useCallFeed(item.callFeed);
  const [audioTrackCount] = useMediaStreamTrackCount(stream);

  // Created lazily and kept across effect re-runs so the audio graph nodes
  // are reused rather than rebuilt whenever a dependency changes.
  const gainNodeRef = useRef<GainNode>();
  const sourceRef = useRef<MediaStreamAudioSourceNode>();

  useEffect(() => {
    // We don't compare the audioMuted flag of useCallFeed here, since unmuting
    // depends on to-device messages which may lag behind the audio actually
    // starting to flow over the network
    if (!item.isLocal && audioContext && audioTrackCount > 0) {
      if (!gainNodeRef.current) {
        gainNodeRef.current = new GainNode(audioContext, {
          gain: localVolume,
        });
      }
      if (!sourceRef.current) {
        sourceRef.current = audioContext.createMediaStreamSource(stream);
      }

      const source = sourceRef.current;
      const gainNode = gainNodeRef.current;

      // Keep the gain in sync with the user's per-participant volume setting
      gainNode.gain.value = localVolume;
      source.connect(gainNode).connect(audioDestination);

      // Only disconnect on cleanup; the nodes themselves are kept for reuse
      return () => {
        source.disconnect();
        gainNode.disconnect();
      };
    }
  }, [
    item,
    audioContext,
    audioDestination,
    stream,
    localVolume,
    audioTrackCount,
  ]);

  return null;
};
// Props for the container that mounts one AudioForParticipant per remote
// participant.
interface AudioContainerProps {
  items: TileDescriptor[];
  audioContext: AudioContext;
  audioDestination: AudioNode;
}

// Mounts a (non-rendering) AudioForParticipant for every non-local tile,
// passing the shared audio context and destination node through to each.
export const AudioContainer: FC<AudioContainerProps> = ({
  items,
  audioContext,
  audioDestination,
}) => {
  const remoteItems = items.filter((item) => !item.isLocal);
  return (
    <>
      {remoteItems.map((item) => (
        <AudioForParticipant
          key={item.id}
          item={item}
          audioContext={audioContext}
          audioDestination={audioDestination}
        />
      ))}
    </>
  );
};

View file

@ -21,8 +21,8 @@ import { RoomMember } from "matrix-js-sdk";
import { VideoGrid, useVideoGridLayout } from "./VideoGrid";
import { VideoTile } from "./VideoTile";
import { Button } from "../button";
import { TileDescriptor } from "../room/InCallView";
import { ConnectionState } from "../room/useGroupCall";
import { TileDescriptor } from "./TileDescriptor";
export default {
title: "VideoGrid",

View file

@ -23,7 +23,7 @@ import { ReactDOMAttributes } from "@use-gesture/react/dist/declarations/src/typ
import styles from "./VideoGrid.module.css";
import { Layout } from "../room/GridLayoutMenu";
import { TileDescriptor } from "../room/InCallView";
import { TileDescriptor } from "./TileDescriptor";
interface TilePosition {
x: number;

View file

@ -25,7 +25,7 @@ import { useRoomMemberName } from "./useRoomMemberName";
import { VideoTile } from "./VideoTile";
import { VideoTileSettingsModal } from "./VideoTileSettingsModal";
import { useModalTriggerState } from "../Modal";
import { TileDescriptor } from "../room/InCallView";
import { TileDescriptor } from "./TileDescriptor";
interface Props {
item: TileDescriptor;
@ -72,7 +72,7 @@ export function VideoTileContainer({
audioContext,
audioDestination,
localVolume,
isLocal || maximised
isLocal
);
const {
modalState: videoTileSettingsModalState,

View file

@ -16,6 +16,8 @@ limitations under the License.
import { RefObject, useEffect } from "react";
// Uses setSinkId on an audio output element to set the device it outputs to,
// where supported by the browser.
export function useAudioOutputDevice(
mediaRef: RefObject<MediaElement>,
audioOutputDevice: string | undefined

View file

@ -17,8 +17,8 @@ limitations under the License.
import { useCallback, useEffect, useState } from "react";
import { TileDescriptor } from "../room/InCallView";
import { useEventTarget } from "../useEvents";
import { TileDescriptor } from "./TileDescriptor";
import { useCallFeed } from "./useCallFeed";
export function useFullscreen(ref: React.RefObject<HTMLElement>): {

View file

@ -15,7 +15,6 @@ limitations under the License.
*/
import { useRef, useEffect, RefObject, useState, useCallback } from "react";
import { parse as parseSdp, write as writeSdp } from "sdp-transform";
import {
acquireContext,
releaseContext,
@ -64,6 +63,8 @@ export const useMediaStreamTrackCount = (
return [audioTrackCount, videoTrackCount];
};
// Binds a media stream to a media output element, returning a ref that
// should be attached to the media element that will play the stream.
export const useMediaStream = (
stream: MediaStream | null,
audioOutputDevice: string | null,
@ -78,7 +79,7 @@ export const useMediaStream = (
console.log(
`useMediaStream update stream mediaRef.current ${!!mediaRef.current} stream ${
stream && stream.id
}`
} muted ${mute}`
);
if (mediaRef.current) {
@ -127,89 +128,30 @@ export const useMediaStream = (
return mediaRef;
};
// Loops the given audio stream back through a pair of local peer connections,
// to make AEC work with Web Audio streams on Chrome: audio that has traversed
// an RTCPeerConnection does get echo-cancelled. The resulting stream should
// be played through an audio element.
// This hack can be removed once the following bug is resolved:
// https://bugs.chromium.org/p/chromium/issues/detail?id=687574
const createLoopback = async (stream: MediaStream): Promise<MediaStream> => {
  // Two local peer connections that talk only to each other
  const sender = new RTCPeerConnection();
  const receiver = new RTCPeerConnection();
  const loopbackStream = new MediaStream();

  // Exchange ICE candidates directly between the two ends
  sender.addEventListener("icecandidate", ({ candidate }) => {
    if (candidate) receiver.addIceCandidate(new RTCIceCandidate(candidate));
  });
  receiver.addEventListener("icecandidate", ({ candidate }) => {
    if (candidate) sender.addIceCandidate(new RTCIceCandidate(candidate));
  });

  // Collect whatever arrives at the far end into the output stream
  receiver.addEventListener("track", ({ track }) =>
    loopbackStream.addTrack(track)
  );

  // Feed the input stream's tracks into the sending side
  for (const track of stream.getTracks()) {
    sender.addTrack(track);
  }

  // Standard offer/answer negotiation between the two connections
  const offer = await sender.createOffer({
    offerToReceiveAudio: false,
    offerToReceiveVideo: false,
  });
  await sender.setLocalDescription(offer);
  await receiver.setRemoteDescription(offer);

  const answer = await receiver.createAnswer();

  // Rewrite the answer SDP to be stereo and (variable) max bitrate
  const parsedSdp = parseSdp(answer.sdp!);
  for (const media of parsedSdp.media) {
    for (const fmtp of media.fmtp) {
      fmtp.config += `;stereo=1;cbr=0;maxaveragebitrate=510000;`;
    }
  }
  answer.sdp = writeSdp(parsedSdp);

  await receiver.setLocalDescription(answer);
  await sender.setRemoteDescription(answer);

  return loopbackStream;
};
export const useAudioContext = (): [
AudioContext,
AudioNode,
RefObject<MediaElement>
] => {
// Provides a properly refcounted instance of the shared audio context,
// along with the context's destination audio node and a ref to be used
// for the <audio> sink element.
export const useAudioContext = (): [AudioContext, AudioNode] => {
const context = useRef<AudioContext>();
const destination = useRef<AudioNode>();
const audioRef = useRef<MediaElement>();
useEffect(() => {
if (audioRef.current && !context.current) {
if (!context.current) {
context.current = acquireContext();
if (window.chrome) {
// We're in Chrome, which needs a loopback hack applied to enable AEC
const streamDest = context.current.createMediaStreamDestination();
destination.current = streamDest;
const audioEl = audioRef.current;
(async () => {
audioEl.srcObject = await createLoopback(streamDest.stream);
await audioEl.play();
})();
return () => {
audioEl.srcObject = null;
releaseContext();
};
} else {
destination.current = context.current.destination;
return releaseContext;
}
destination.current = context.current.destination;
return releaseContext;
}
}, []);
return [context.current!, destination.current!, audioRef];
return [context.current!, destination.current!];
};
// Either renders a media stream with spatial audio or is just a no-op wrapper
// around useMediaStream, depending on whether spatial audio is enabled.
// Returns refs for the tile element from which the position is derived and
// a <video> element to render the video to.
// (hooks can't be conditional so we must use the same hook in each case).
export const useSpatialMediaStream = (
stream: MediaStream | null,
audioContext: AudioContext,
@ -219,7 +161,12 @@ export const useSpatialMediaStream = (
): [RefObject<HTMLDivElement>, RefObject<MediaElement>] => {
const tileRef = useRef<HTMLDivElement | null>(null);
const [spatialAudio] = useSpatialAudio();
// We always handle audio separately from the video element
// This media stream is only used for the video - the audio goes via the audio
// context, so the audio output doesn't matter and the element is always muted
// (we could split the video out into a separate stream with just the video track
// and pass that as the srcObject of the element, but it seems unnecessary when we
// can just mute the element).
const mediaRef = useMediaStream(stream, null, true);
const [audioTrackCount] = useMediaStreamTrackCount(stream);