// ti-pote/apps/robot-client/scripts/audio-loopback.ts

/**
 * Ti-Pote — End-to-end audio loopback test.
 *
 * What it proves: the whole Pi ↔ ESP32 ↔ mic/speaker chain works,
 * without bringing the cloud/wake-word/orchestrator into the picture.
 *
 * What it does:
 *   1. Opens the serial link to the ESP32.
 *   2. Captures `CAPTURE_MS` (default 5000) of mic audio via
 *      AUDIO_UP frames into a single in-memory buffer.
 *   3. Pauses briefly.
 *   4. Streams that buffer back to the ESP32 as AUDIO_DOWN frames
 *      and waits for the speaker to finish playing.
 *
 * Expected result: you say "allô allô" during step 2 and hear your
 * own voice played back on the robot's speaker a moment later.
 *
 * Run with:
 *   HARDWARE_SERIAL_PORT=/dev/serial0 pnpm --filter @ti-pote/robot-client audio:loopback
 *
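 * Optional env:
 *   CAPTURE_MS           — capture duration in ms (default 5000)
 *   HARDWARE_SERIAL_PORT — serial device path (default /dev/serial0)
 *   HARDWARE_SERIAL_BAUD — serial baud rate (default 921600)
 *   DEBUG                — any non-empty value: log the head of the first
 *                          frames and dump the raw capture to DUMP_PATH
 *   DUMP_PATH            — raw capture dump file, written only when DEBUG
 *                          is set (default /tmp/tipote-capture.raw)
 *   SKIP_PLAYBACK        — any non-empty value: stop after the capture step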
 */

import { writeFileSync } from 'node:fs';
import { HardwareService, Emotion } from '../src/hardware/index.js';
import { Esp32AudioService } from '../src/services/audio.service.js';

const path = process.env.HARDWARE_SERIAL_PORT ?? '/dev/serial0';
const baudRate = parseInt(process.env.HARDWARE_SERIAL_BAUD ?? '921600', 10);
const captureMs = parseInt(process.env.CAPTURE_MS ?? '5000', 10);
const debug = !!process.env.DEBUG;
const dumpPath = process.env.DUMP_PATH ?? '/tmp/tipote-capture.raw';
const skipPlayback = !!process.env.SKIP_PLAYBACK;

const SAMPLE_RATE = 16000;
const BYTES_PER_SAMPLE = 2;

let debugFramesSeen = 0;

async function sleep(ms: number): Promise<void> {
  return new Promise((r) => setTimeout(r, ms));
}

async function main(): Promise<void> {
  const hw = new HardwareService({ path, baudRate, heartbeatIntervalMs: 1000 });
  hw.on('log', (line) => console.log(`[firmware] ${line}`));
  hw.on('error', (err) => console.error(`[firmware error] ${err.message}`));
  if (debug) {
    hw.on('audio_up', (chunk) => {
      // Print first 8 int16 samples of the first few frames
      // so we can see whether the wire carries zeros or real data.
      if (debugFramesSeen < 3) {
        const head: number[] = [];
        for (let i = 0; i < Math.min(chunk.length, 16); i += 2) {
          head.push(chunk.readInt16LE(i));
        }
        console.log(`[debug] frame ${debugFramesSeen} len=${chunk.length} head=${head.join(',')}`);
        debugFramesSeen++;
      }
    });
  }

  console.log(`→ opening ${path} @ ${baudRate} baud`);
  await hw.connect();

  try {
    const rtt = await hw.ping(Buffer.from('loopback'));
    console.log(`→ ping round-trip: ${rtt.toFixed(1)} ms`);

    hw.sendEmotion(Emotion.SURPRISED);

    // ── 1. Capture ────────────────────────────────────────────────
    const chunks: Buffer[] = [];
    let bytesCaptured = 0;

    const collect = (chunk: Buffer): void => {
      chunks.push(chunk);
      bytesCaptured += chunk.length;
    };
    hw.on('audio_up', collect);

    console.log(`🎙️  Recording ${captureMs} ms — say something!`);
    await sleep(captureMs);

    hw.off('audio_up', collect);
    const capture = Buffer.concat(chunks);
    const samples = capture.length / BYTES_PER_SAMPLE;
    const durationMs = (samples / SAMPLE_RATE) * 1000;
    console.log(
      `✅ captured ${capture.length} bytes (${samples} samples, ${durationMs.toFixed(0)} ms)` +
        ` across ${chunks.length} frames`,
    );

    if (capture.length === 0) {
      console.error(
        '❌ no audio received from the ESP32. Check the I2S wiring ' +
          '(BCLK=32, LRCLK=33, DIN=34) and that the firmware got past `audio: I2S ready`.',
      );
      return;
    }

    // Quick RMS sanity check so we catch "mic muted" / "disconnected" early.
    const rms = computeRms(capture);
    console.log(`   RMS level: ${rms.toFixed(0)} (silence ≈ 10, speech ≳ 500)`);

    if (debug) {
      // Dump the raw capture so we can replay it offline:
      //   aplay -r 16000 -f S16_LE -c 1 /tmp/tipote-capture.raw
      writeFileSync(dumpPath, capture);
      console.log(`[debug] raw capture written to ${dumpPath} (${capture.length} bytes)`);

      const allZero = capture.every((b) => b === 0);
      console.log(`[debug] capture.allZero=${allZero}`);

      // Also print some distinct int16 values we saw, to spot patterns.
      const seen = new Set<number>();
      for (let i = 0; i < capture.length - 1 && seen.size < 10; i += 2) {
        seen.add(capture.readInt16LE(i));
      }
      console.log(`[debug] first distinct samples: ${[...seen].join(',')}`);
    }

    if (skipPlayback) {
      console.log('SKIP_PLAYBACK set — not sending AUDIO_DOWN');
      return;
    }

    // ── 2. Playback ───────────────────────────────────────────────
    await sleep(500);

    const audio = new Esp32AudioService(
      {
        backend: 'esp32',
        captureDevice: 'default',
        playbackDevice: 'default',
        sampleRate: SAMPLE_RATE,
        bitDepth: 16,
        channels: 1,
        chunkDurationMs: 20,
      },
      hw,
    );

    hw.sendEmotion(Emotion.HAPPY);
    console.log('🔊 Playing back on the ESP32 speaker...');
    await audio.play(capture);
    console.log('✅ playback done');
  } finally {
    hw.sendEmotion(Emotion.NEUTRAL);
    await sleep(200);
    await hw.disconnect();
  }
}

function computeRms(buf: Buffer): number {
  if (buf.length < 2) return 0;
  let sumSquares = 0;
  const samples = buf.length / 2;
  for (let i = 0; i < buf.length - 1; i += 2) {
    const s = buf.readInt16LE(i);
    sumSquares += s * s;
  }
  return Math.sqrt(sumSquares / samples);
}

main().catch((err) => {
  console.error('loopback failed:', err);
  process.exit(1);
});