Make spatial audio Firefox-only

Hopefully explained in comment: we have a heisenbug where we sometimes
lack audio from a certain participant, so this simplifies the audio
path by removing the workaround required to do AEC with spatial audio
on chrome.
This commit is contained in:
David Baker 2022-12-16 17:12:17 +00:00
commit 223793a445
11 changed files with 94 additions and 208 deletions

View file

@ -1,96 +0,0 @@
/*
Copyright 2022 New Vector Ltd
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import React, { FC, useEffect, useRef } from "react";
import { TileDescriptor } from "../room/InCallView";
import { useCallFeed } from "./useCallFeed";
import { useMediaStreamTrackCount } from "./useMediaStream";
// XXX: These in fact do not render anything but to my knowledge this is the
// only way to use a hook on an array
// Props for AudioForParticipant.
interface AudioForParticipantProps {
// Descriptor of the participant tile whose audio should be played.
item: TileDescriptor;
// Shared Web Audio context in which the gain/source nodes are created.
audioContext: AudioContext;
// Node that this participant's audio is routed into (the shared mix).
audioDestination: AudioNode;
}
// Plays the audio of a single (remote) participant by wiring their
// MediaStream into the shared audio context via a per-participant GainNode.
// Renders nothing; it exists purely so a hook can run per array item.
export const AudioForParticipant: FC<AudioForParticipantProps> = ({
  item,
  audioContext,
  audioDestination,
}) => {
  const { stream, localVolume } = useCallFeed(item.callFeed);
  const [audioTrackCount] = useMediaStreamTrackCount(stream);

  // Nodes are cached in refs so the audio graph isn't torn down and rebuilt
  // on every re-render (e.g. on volume changes).
  const gainNodeRef = useRef<GainNode>();
  const sourceRef = useRef<MediaStreamAudioSourceNode>();

  useEffect(() => {
    // We don't compare the audioMuted flag of useCallFeed here, since unmuting
    // depends on to-device messages which may lag behind the audio actually
    // starting to flow over the network
    if (!item.isLocal && audioContext && audioTrackCount > 0) {
      if (!gainNodeRef.current) {
        gainNodeRef.current = new GainNode(audioContext, {
          gain: localVolume,
        });
      }
      // Recreate the source node if the underlying MediaStream has changed:
      // a source cached for a stale stream would keep feeding (or silence)
      // long after the feed swapped streams.
      if (!sourceRef.current || sourceRef.current.mediaStream !== stream) {
        sourceRef.current?.disconnect();
        sourceRef.current = audioContext.createMediaStreamSource(stream);
      }

      const source = sourceRef.current;
      const gainNode = gainNodeRef.current;
      // Keep the gain in sync with the per-user local volume setting.
      gainNode.gain.value = localVolume;
      source.connect(gainNode).connect(audioDestination);

      // Disconnect on cleanup so a re-run of the effect (or unmount) doesn't
      // leave duplicate connections into the destination.
      return () => {
        source.disconnect();
        gainNode.disconnect();
      };
    }
  }, [
    item,
    audioContext,
    audioDestination,
    stream,
    localVolume,
    audioTrackCount,
  ]);

  return null;
};
// Props for AudioContainer.
interface AudioContainerProps {
// All participant tiles; local ones are filtered out before rendering audio.
items: TileDescriptor[];
// Shared Web Audio context passed through to each AudioForParticipant.
audioContext: AudioContext;
// Destination node that every participant's audio is mixed into.
audioDestination: AudioNode;
}
// Renders one (invisible) AudioForParticipant per remote participant,
// forwarding the shared audio context and destination node to each.
export const AudioContainer: FC<AudioContainerProps> = ({ items, ...rest }) => {
  // The local user hears themselves acoustically; never play their own audio.
  const remoteItems = items.filter((item) => !item.isLocal);

  return (
    <>
      {remoteItems.map((item) => (
        <AudioForParticipant key={item.id} item={item} {...rest} />
      ))}
    </>
  );
};

View file

@ -21,8 +21,8 @@ import { RoomMember } from "matrix-js-sdk";
import { VideoGrid, useVideoGridLayout } from "./VideoGrid";
import { VideoTile } from "./VideoTile";
import { Button } from "../button";
import { TileDescriptor } from "../room/InCallView";
import { ConnectionState } from "../room/useGroupCall";
import { TileDescriptor } from "./TileDescriptor";
export default {
title: "VideoGrid",

View file

@ -23,7 +23,7 @@ import { ReactDOMAttributes } from "@use-gesture/react/dist/declarations/src/typ
import styles from "./VideoGrid.module.css";
import { Layout } from "../room/GridLayoutMenu";
import { TileDescriptor } from "../room/InCallView";
import { TileDescriptor } from "./TileDescriptor";
interface TilePosition {
x: number;

View file

@ -25,7 +25,7 @@ import { useRoomMemberName } from "./useRoomMemberName";
import { VideoTile } from "./VideoTile";
import { VideoTileSettingsModal } from "./VideoTileSettingsModal";
import { useModalTriggerState } from "../Modal";
import { TileDescriptor } from "../room/InCallView";
import { TileDescriptor } from "./TileDescriptor";
interface Props {
item: TileDescriptor;
@ -72,7 +72,7 @@ export function VideoTileContainer({
audioContext,
audioDestination,
localVolume,
isLocal || maximised
isLocal
);
const {
modalState: videoTileSettingsModalState,

View file

@ -16,6 +16,8 @@ limitations under the License.
import { RefObject, useEffect } from "react";
// Uses setSinkId on an audio output element to set the device it outputs to,
// where supported by the browser.
export function useAudioOutputDevice(
mediaRef: RefObject<MediaElement>,
audioOutputDevice: string | undefined

View file

@ -17,8 +17,8 @@ limitations under the License.
import { useCallback, useEffect, useState } from "react";
import { TileDescriptor } from "../room/InCallView";
import { useEventTarget } from "../useEvents";
import { TileDescriptor } from "./TileDescriptor";
import { useCallFeed } from "./useCallFeed";
export function useFullscreen(ref: React.RefObject<HTMLElement>): {

View file

@ -15,7 +15,6 @@ limitations under the License.
*/
import { useRef, useEffect, RefObject, useState, useCallback } from "react";
import { parse as parseSdp, write as writeSdp } from "sdp-transform";
import {
acquireContext,
releaseContext,
@ -64,6 +63,8 @@ export const useMediaStreamTrackCount = (
return [audioTrackCount, videoTrackCount];
};
// Binds a media stream to a media output element, returning a ref that
// should then be attached to the media element that will play the stream.
export const useMediaStream = (
stream: MediaStream | null,
audioOutputDevice: string | null,
@ -78,7 +79,7 @@ export const useMediaStream = (
console.log(
`useMediaStream update stream mediaRef.current ${!!mediaRef.current} stream ${
stream && stream.id
}`
} muted ${mute}`
);
if (mediaRef.current) {
@ -127,89 +128,30 @@ export const useMediaStream = (
return mediaRef;
};
// Loops the given audio stream back through a local peer connection, to make
// AEC work with Web Audio streams on Chrome. The resulting stream should be
// played through an audio element.
// This hack can be removed once the following bug is resolved:
// https://bugs.chromium.org/p/chromium/issues/detail?id=687574
const createLoopback = async (stream: MediaStream): Promise<MediaStream> => {
// Prepare our local peer connections
const conn = new RTCPeerConnection();
const loopbackConn = new RTCPeerConnection();
const loopbackStream = new MediaStream();
// Exchange ICE candidates directly between the two in-page connections
// (both ends are local, so no signalling server is involved).
conn.addEventListener("icecandidate", ({ candidate }) => {
if (candidate) loopbackConn.addIceCandidate(new RTCIceCandidate(candidate));
});
loopbackConn.addEventListener("icecandidate", ({ candidate }) => {
if (candidate) conn.addIceCandidate(new RTCIceCandidate(candidate));
});
// Collect whatever tracks arrive on the receiving side into the stream
// we hand back to the caller.
loopbackConn.addEventListener("track", ({ track }) =>
loopbackStream.addTrack(track)
);
// Hook the connections together
stream.getTracks().forEach((track) => conn.addTrack(track));
// Standard offer/answer dance, performed entirely locally. The order of
// setLocalDescription/setRemoteDescription calls below is significant.
const offer = await conn.createOffer({
offerToReceiveAudio: false,
offerToReceiveVideo: false,
});
await conn.setLocalDescription(offer);
await loopbackConn.setRemoteDescription(offer);
const answer = await loopbackConn.createAnswer();
// Rewrite SDP to be stereo and (variable) max bitrate
// (munged before setLocalDescription so both ends negotiate with the
// modified parameters).
const parsedSdp = parseSdp(answer.sdp!);
parsedSdp.media.forEach((m) =>
m.fmtp.forEach(
(f) => (f.config += `;stereo=1;cbr=0;maxaveragebitrate=510000;`)
)
);
answer.sdp = writeSdp(parsedSdp);
await loopbackConn.setLocalDescription(answer);
await conn.setRemoteDescription(answer);
return loopbackStream;
};
export const useAudioContext = (): [
AudioContext,
AudioNode,
RefObject<MediaElement>
] => {
// Provides a properly refcounted instance of the shared audio context,
// along with the context's destination audio node and a ref to be used
// for the <audio> sink element.
export const useAudioContext = (): [AudioContext, AudioNode] => {
const context = useRef<AudioContext>();
const destination = useRef<AudioNode>();
const audioRef = useRef<MediaElement>();
useEffect(() => {
if (audioRef.current && !context.current) {
if (!context.current) {
context.current = acquireContext();
if (window.chrome) {
// We're in Chrome, which needs a loopback hack applied to enable AEC
const streamDest = context.current.createMediaStreamDestination();
destination.current = streamDest;
const audioEl = audioRef.current;
(async () => {
audioEl.srcObject = await createLoopback(streamDest.stream);
await audioEl.play();
})();
return () => {
audioEl.srcObject = null;
releaseContext();
};
} else {
destination.current = context.current.destination;
return releaseContext;
}
destination.current = context.current.destination;
return releaseContext;
}
}, []);
return [context.current!, destination.current!, audioRef];
return [context.current!, destination.current!];
};
// Either renders a media stream with spatial audio or is just a no-op wrapper
// around useMediaStream, depending on whether spatial audio is enabled.
// Returns refs for the tile element from which the position is derived and
// a <video> element to render the video to.
// (hooks can't be conditional so we must use the same hook in each case).
export const useSpatialMediaStream = (
stream: MediaStream | null,
audioContext: AudioContext,
@ -219,7 +161,12 @@ export const useSpatialMediaStream = (
): [RefObject<HTMLDivElement>, RefObject<MediaElement>] => {
const tileRef = useRef<HTMLDivElement | null>(null);
const [spatialAudio] = useSpatialAudio();
// We always handle audio separately from the video element
// This media stream is only used for the video - the audio goes via the audio
// context, so the audio output doesn't matter and the element is always muted
// (we could split the video out into a separate stream with just the video track
// and pass that as the srcObject of the element, but it seems unnecessary when we
// can just mute the element).
const mediaRef = useMediaStream(stream, null, true);
const [audioTrackCount] = useMediaStreamTrackCount(stream);