Skip to main content

Overview

The VoxNexus WebSocket API provides real-time bidirectional communication for voice services. It’s ideal for applications requiring low-latency, interactive voice processing such as:
  • Real-time voice assistants
  • Live transcription services
  • Interactive voice response (IVR) systems
  • Voice-controlled applications
  • Real-time captioning

Connection

Endpoints

  • Text-to-Speech: wss://api.voxnexus.ai/v1/tts/realtime
  • Speech-to-Text: wss://api.voxnexus.ai/v1/stt/realtime

Authentication

Authenticate using a Bearer token in the connection headers. Note: passing custom headers to the WebSocket constructor works in Node.js clients (e.g. the ws package); the browser WebSocket API does not support custom headers, so browser apps should use an alternative such as a token query parameter or a first-message auth exchange if supported:
const ws = new WebSocket('wss://api.voxnexus.ai/v1/tts/realtime', {
  headers: {
    'Authorization': 'Bearer YOUR_API_KEY'
  }
});

Connection Lifecycle

  1. Connect: Establish WebSocket connection with authentication
  2. Initialize: Send initialization message with configuration
  3. Ready: Receive ready confirmation from server
  4. Exchange: Send/receive data messages
  5. Close: Gracefully close connection when done

Text-to-Speech WebSocket

Message Types

Client Messages

Initialization (init)
{
  "type": "init",
  "voice_id": "vl-xiaoxiao",
  "language": "zh-CN",
  "format": "mp3",
  "sample_rate": 16000,
  "speed": 1.0,
  "pitch": 0,
  "volume": 1.0,
  "ssml": false,
  "voice_config": {
    "style": "cheerful"
  }
}
Text (text)
{
  "type": "text",
  "text": "Hello, this is a test message",
  "is_final": false
}

Server Messages

Ready (ready)
{
  "type": "ready",
  "request_id": "req_1234567890",
  "voice_id": "vl-xiaoxiao",
  "language": "zh-CN",
  "format": "mp3",
  "sample_rate": 16000
}
Audio (audio)
{
  "type": "audio",
  "data": "base64-encoded-audio-data",
  "is_final": false
}
Error (error)
{
  "type": "error",
  "error": "Invalid voice_id",
  "code": "VOICE_NOT_FOUND",
  "request_id": "req_1234567890"
}

Complete Example

/**
 * Minimal client for the VoxNexus real-time TTS WebSocket endpoint.
 *
 * Lifecycle: connect() -> initialize(config) -> awaitReady() ->
 * synthesize(text, isFinal) -> close().
 */
class TTSWebSocketClient {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.ws = null;
    this.ready = false;
    // Resolvers queued by awaitReady() before the server's "ready" arrives.
    this.readyWaiters = [];
  }

  /**
   * Opens the WebSocket connection. Resolves once the socket is open;
   * synthesis is not possible until the "ready" message (sent in response
   * to initialize()) has been received — see awaitReady().
   * NOTE(review): constructor headers work with Node's `ws` package;
   * browser WebSockets cannot set custom headers.
   */
  connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket('wss://api.voxnexus.ai/v1/tts/realtime', {
        headers: {
          'Authorization': `Bearer ${this.apiKey}`
        }
      });

      this.ws.onopen = () => {
        console.log('Connected');
        resolve();
      };

      this.ws.onmessage = (event) => {
        this.handleMessage(JSON.parse(event.data));
      };

      this.ws.onerror = (error) => {
        console.error('WebSocket error:', error);
        reject(error);
      };

      this.ws.onclose = () => {
        console.log('Connection closed');
        this.ready = false;
      };
    });
  }

  /** Sends the "init" configuration message. Must be called after connect(). */
  initialize(config) {
    if (!this.ws) {
      throw new Error('Not connected. Call connect() first.');
    }
    this.ws.send(JSON.stringify({
      type: 'init',
      ...config
    }));
  }

  /**
   * Resolves once the server has acknowledged initialization with a
   * "ready" message. Resolves immediately if already ready.
   */
  awaitReady() {
    if (this.ready) {
      return Promise.resolve();
    }
    return new Promise((resolve) => {
      this.readyWaiters.push(resolve);
    });
  }

  /**
   * Sends a text segment for synthesis.
   * @param {string} text - Text to synthesize.
   * @param {boolean} [isFinal=false] - Marks the last segment of the request.
   * @throws {Error} If the server has not yet sent "ready".
   */
  synthesize(text, isFinal = false) {
    if (!this.ready) {
      throw new Error('Not ready. Wait for ready message.');
    }

    this.ws.send(JSON.stringify({
      type: 'text',
      text: text,
      is_final: isFinal
    }));
  }

  /** Dispatches a parsed server message to the matching handler. */
  handleMessage(message) {
    switch (message.type) {
      case 'ready':
        this.ready = true;
        console.log('Ready:', message.request_id);
        // Wake any callers blocked in awaitReady().
        this.readyWaiters.splice(0).forEach((resolve) => resolve());
        break;

      case 'audio':
        this.onAudio(message.data, message.is_final);
        break;

      case 'error':
        console.error('Error:', message.error);
        this.onError(message);
        break;

      default:
        // Surface unexpected message types instead of dropping them silently.
        console.warn('Unhandled message type:', message.type);
    }
  }

  /** Override: receives base64 audio chunks; isFinal marks the last one. */
  onAudio(base64Data, isFinal) {
    // Decode and handle audio
    const audioData = atob(base64Data);
    // Play audio or save to buffer
  }

  /** Override: receives server errors ({error, code, request_id}). */
  onError(error) {
    // Handle error
  }

  /** Closes the socket if it was opened. */
  close() {
    if (this.ws) {
      this.ws.close();
    }
  }
}

// Usage
const client = new TTSWebSocketClient('YOUR_API_KEY');
await client.connect();
client.initialize({
  voice_id: 'vl-xiaoxiao',
  format: 'mp3',
  sample_rate: 16000
});

// Wait for the server's ready message instead of a fixed timeout —
// a hard-coded 1 s delay races against slow connections and wastes
// time on fast ones.
await new Promise((resolve) => {
  const poll = setInterval(() => {
    if (client.ready) {
      clearInterval(poll);
      resolve();
    }
  }, 50);
});
client.synthesize('Hello, world!', true);

Speech-to-Text WebSocket

Message Types

Client Messages

Initialization (init)
{
  "type": "init",
  "language": "zh-CN",
  "format": "pcm",
  "sample_rate": 16000,
  "enable_timestamps": true,
  "enable_confidence": true,
  "enable_speaker_diarization": false,
  "keywords": ["keyword1"],
  "custom_vocabulary": ["custom_word"]
}
Audio (audio)
{
  "type": "audio",
  "data": "base64-encoded-audio-chunk"
}

Server Messages

Ready (ready)
{
  "type": "ready",
  "request_id": "req_1234567890",
  "language": "zh-CN",
  "format": "pcm",
  "sample_rate": 16000
}
Partial Result (partial)
{
  "type": "partial",
  "text": "Hello, this is"
}
Final Result (final)
{
  "type": "final",
  "text": "Hello, this is a test message.",
  "confidence": 0.95,
  "start_time_ms": 0,
  "end_time_ms": 2500,
  "words": [
    {
      "word": "hello",
      "start_time_ms": 0,
      "end_time_ms": 500,
      "confidence": 0.98
    }
  ],
  "speakers": [
    {
      "speaker_id": "speaker_1",
      "text": "Hello, this is a test message.",
      "start_time_ms": 0,
      "end_time_ms": 2500
    }
  ]
}
Error (error)
{
  "type": "error",
  "error": "Invalid audio format",
  "code": "UNSUPPORTED_FORMAT",
  "request_id": "req_1234567890"
}

Complete Example

/**
 * Minimal client for the VoxNexus real-time STT WebSocket endpoint.
 *
 * Lifecycle: connect() -> initialize(config) -> wait for "ready" ->
 * startRecording() -> receive partial/final transcripts -> close().
 */
class STTWebSocketClient {
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.ws = null;
    this.ready = false;
    this.audioContext = null;
    this.processor = null;
    // Microphone MediaStream, kept so the mic can be released on stop.
    this.stream = null;
  }

  /**
   * Opens the WebSocket connection; resolves when the socket is open.
   * NOTE(review): constructor headers work with Node's `ws` package;
   * browser WebSockets cannot set custom headers.
   */
  async connect() {
    return new Promise((resolve, reject) => {
      this.ws = new WebSocket('wss://api.voxnexus.ai/v1/stt/realtime', {
        headers: {
          'Authorization': `Bearer ${this.apiKey}`
        }
      });

      this.ws.onopen = () => {
        console.log('Connected');
        resolve();
      };

      this.ws.onmessage = (event) => {
        this.handleMessage(JSON.parse(event.data));
      };

      this.ws.onerror = (error) => {
        console.error('WebSocket error:', error);
        reject(error);
      };

      this.ws.onclose = () => {
        console.log('Connection closed');
        this.ready = false;
      };
    });
  }

  /** Sends the "init" configuration message. Must be called after connect(). */
  initialize(config) {
    if (!this.ws) {
      throw new Error('Not connected. Call connect() first.');
    }
    this.ws.send(JSON.stringify({
      type: 'init',
      ...config
    }));
  }

  /**
   * Captures microphone audio, converts it to 16-bit PCM, and streams
   * base64 chunks to the server. Chunks produced before the server's
   * "ready" message are dropped.
   * NOTE(review): ScriptProcessorNode is deprecated; AudioWorklet is the
   * modern replacement but requires a separate worklet module.
   * @param {number} [sampleRate=16000] - Capture sample rate in Hz.
   */
  async startRecording(sampleRate = 16000) {
    this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    this.audioContext = new AudioContext({ sampleRate });
    const source = this.audioContext.createMediaStreamSource(this.stream);

    this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
    this.processor.onaudioprocess = (e) => {
      if (!this.ready) return;

      const audioData = e.inputBuffer.getChannelData(0);
      const pcm16 = new Int16Array(audioData.length);

      // Scale float samples in [-1, 1] to the Int16 range, clamping overflow.
      for (let i = 0; i < audioData.length; i++) {
        pcm16[i] = Math.max(-32768, Math.min(32767, audioData[i] * 32768));
      }

      const base64 = btoa(String.fromCharCode(...new Uint8Array(pcm16.buffer)));
      this.ws.send(JSON.stringify({
        type: 'audio',
        data: base64
      }));
    };

    source.connect(this.processor);
    this.processor.connect(this.audioContext.destination);
  }

  /** Tears down audio processing AND releases the microphone. */
  stopRecording() {
    if (this.processor) {
      this.processor.disconnect();
      this.processor = null;
    }
    if (this.audioContext) {
      this.audioContext.close();
      this.audioContext = null;
    }
    // BUG FIX: the original never stopped the MediaStream tracks, so the
    // microphone (and the browser's recording indicator) stayed active
    // after stopRecording().
    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }
  }

  /** Dispatches a parsed server message to the matching handler. */
  handleMessage(message) {
    switch (message.type) {
      case 'ready':
        this.ready = true;
        console.log('Ready:', message.request_id);
        break;

      case 'partial':
        this.onPartial(message.text);
        break;

      case 'final':
        this.onFinal(message);
        break;

      case 'error':
        console.error('Error:', message.error);
        this.onError(message);
        break;

      default:
        // Surface unexpected message types instead of dropping them silently.
        console.warn('Unhandled message type:', message.type);
    }
  }

  /** Override: interim hypothesis for the in-progress utterance. */
  onPartial(text) {
    console.log('Partial:', text);
    // Update UI with interim results
  }

  /** Override: completed utterance with confidence/timing fields. */
  onFinal(result) {
    console.log('Final:', result.text);
    console.log('Confidence:', result.confidence);
    // Handle final transcription
  }

  /** Override: server-reported error ({error, code, request_id}). */
  onError(error) {
    // Handle error
  }

  /** Stops recording (releasing the mic) and closes the socket. */
  close() {
    this.stopRecording();
    if (this.ws) {
      this.ws.close();
    }
  }
}

// Usage
const client = new STTWebSocketClient('YOUR_API_KEY');
await client.connect();
client.initialize({
  format: 'pcm',
  sample_rate: 16000,
  enable_timestamps: true,
  enable_confidence: true
});

// Wait for the server's ready message instead of guessing with a fixed
// delay — a hard-coded 1 s timeout races against slow connections.
await new Promise((resolve) => {
  const poll = setInterval(() => {
    if (client.ready) {
      clearInterval(poll);
      resolve();
    }
  }, 50);
});
await client.startRecording(16000);

Best Practices

Connection Management

Reconnection Logic
/**
 * WebSocket wrapper that reconnects automatically with exponential
 * backoff (1 s, 2 s, 4 s, ...) for up to maxReconnectAttempts tries.
 */
class ReconnectingWebSocket {
  constructor(url, options) {
    this.url = url;
    this.options = options;
    this.ws = null;
    this.reconnectAttempts = 0;
    this.maxReconnectAttempts = 5;
    this.reconnectDelay = 1000; // base delay in milliseconds
  }

  /** Delay (ms) before the next reconnect attempt, doubling per attempt. */
  nextReconnectDelay() {
    return this.reconnectDelay * 2 ** this.reconnectAttempts;
  }

  connect() {
    this.ws = new WebSocket(this.url, this.options);

    this.ws.onclose = () => {
      if (this.reconnectAttempts >= this.maxReconnectAttempts) return;
      // BUG FIX: the original scheduled the retry with
      // `reconnectDelay * reconnectAttempts`, so the first retry fired
      // with a 0 ms delay and the growth was linear; this uses true
      // exponential backoff as recommended in the troubleshooting notes.
      setTimeout(() => {
        this.reconnectAttempts++;
        this.connect();
      }, this.nextReconnectDelay());
    };

    this.ws.onopen = () => {
      this.reconnectAttempts = 0; // a healthy connection resets the budget
    };
  }
}
Heartbeat/Ping
// Application-level heartbeat: ping every 30 seconds so idle
// connections are not dropped by intermediaries.
const HEARTBEAT_INTERVAL_MS = 30000;
setInterval(() => {
  if (ws.readyState !== WebSocket.OPEN) return;
  ws.send(JSON.stringify({ type: 'ping' }));
}, HEARTBEAT_INTERVAL_MS);

Error Handling

// Log transport-level failures.
ws.onerror = (err) => {
  console.error('WebSocket error:', err);
  // Implement retry logic or notify user
};

// Close code 1000 is a normal closure; anything else is unexpected and a
// candidate for automatic reconnection.
ws.onclose = (event) => {
  if (event.code === 1000) return;
  console.error('Unexpected closure:', event.code, event.reason);
  // Attempt reconnection
};

Audio Processing

Chunk Size Optimization
  • Send audio chunks of 100-200ms for optimal latency
  • Too small: Increased overhead
  • Too large: Increased latency
Buffer Management
/**
 * Accumulates 16-bit PCM samples and ships them to the server in
 * fixed-size chunks (100 ms at 16 kHz) over the shared `ws` socket.
 * NOTE(review): this name shadows the Web Audio API's built-in
 * AudioBuffer; consider renaming (kept here for doc compatibility).
 */
class AudioBuffer {
  constructor() {
    this.buffer = [];
    this.chunkSize = 1600; // samples per chunk: 100ms at 16kHz
  }

  /**
   * Appends samples and sends every complete chunk.
   * Uses an explicit loop instead of push(...spread) so very large input
   * arrays cannot overflow the call stack.
   */
  addAudio(audioData) {
    for (const sample of audioData) {
      this.buffer.push(sample);
    }

    while (this.buffer.length >= this.chunkSize) {
      const chunk = this.buffer.splice(0, this.chunkSize);
      this.sendChunk(chunk);
    }
  }

  /** Base64-encodes a chunk of Int16 samples and sends it. */
  sendChunk(chunk) {
    // BUG FIX: the original called String.fromCharCode on raw sample
    // values; btoa() throws for code units > 255, so any 16-bit sample
    // outside 0-255 broke encoding. Pack samples into their
    // little-endian bytes first, then base64-encode the byte string.
    const bytes = new Uint8Array(Int16Array.from(chunk).buffer);
    let binary = '';
    for (let i = 0; i < bytes.length; i++) {
      binary += String.fromCharCode(bytes[i]);
    }
    ws.send(JSON.stringify({
      type: 'audio',
      data: btoa(binary)
    }));
  }

  /** Sends whatever remains as a final, possibly short, chunk. */
  flush() {
    if (this.buffer.length > 0) {
      this.sendChunk(this.buffer);
      this.buffer = [];
    }
  }
}

Performance Optimization

Batch Text Messages
// For TTS, batch multiple text segments: pace them 100 ms apart and
// flag only the last segment as final.
const texts = ['Hello', 'world', 'this', 'is', 'a', 'test'];
const lastIndex = texts.length - 1;
texts.forEach((segment, i) => {
  setTimeout(() => client.synthesize(segment, i === lastIndex), i * 100);
});
Throttle Audio Sending
/**
 * Paces outbound audio messages: chunks are queued and dispatched one
 * per `interval` milliseconds so the socket is never flooded with
 * back-to-back sends.
 */
class ThrottledAudioSender {
  constructor(ws, interval = 100) {
    this.ws = ws;
    this.interval = interval;
    this.queue = [];
    this.timer = null;
  }

  /** Queues a chunk and starts the drain loop if it is not running. */
  send(audioData) {
    this.queue.push(audioData);

    if (this.timer !== null) {
      return; // drain loop already active
    }

    this.timer = setInterval(() => {
      if (this.queue.length === 0) {
        // Nothing left to send; stop ticking until the next send().
        clearInterval(this.timer);
        this.timer = null;
        return;
      }
      const chunk = this.queue.shift();
      this.ws.send(JSON.stringify({
        type: 'audio',
        data: chunk
      }));
    }, this.interval);
  }
}

Common Patterns

Bidirectional Voice Conversation

// Combine TTS and STT for voice conversation
/**
 * Full-duplex voice loop: transcribes the microphone with STT and
 * answers each final transcript through the TTS channel.
 */
class VoiceConversation {
  constructor(apiKey) {
    this.ttsClient = new TTSWebSocketClient(apiKey);
    this.sttClient = new STTWebSocketClient(apiKey);
  }

  /** Connects both channels, wires STT results to TTS replies, starts the mic. */
  async start() {
    await Promise.all([
      this.ttsClient.connect(),
      this.sttClient.connect()
    ]);

    // Speak each recognized utterance back through the TTS channel.
    this.sttClient.onFinal = (result) => {
      const reply = this.processUserInput(result.text);
      this.ttsClient.synthesize(reply, true);
    };

    this.ttsClient.initialize({ voice_id: 'vl-xiaoxiao' });
    this.sttClient.initialize({ format: 'pcm', sample_rate: 16000 });

    await this.sttClient.startRecording();
  }

  /** Placeholder dialogue policy: echo the user's words back. */
  processUserInput(text) {
    return `You said: ${text}`;
  }
}

Real-time Captioning

/**
 * Streams microphone audio to STT and mirrors the transcript into a DOM
 * element: interim hypotheses carry the "interim" CSS class, final
 * results replace them and are persisted via saveCaption().
 */
class LiveCaptioning {
  constructor(apiKey, captionElement) {
    this.sttClient = new STTWebSocketClient(apiKey);
    this.captionElement = captionElement;
  }

  /** Connects, configures recognition, and begins live captioning. */
  async start() {
    await this.sttClient.connect();
    this.sttClient.initialize({
      format: 'pcm',
      sample_rate: 16000,
      enable_timestamps: true
    });

    this.sttClient.onPartial = (text) => this.showInterim(text);
    this.sttClient.onFinal = (result) => this.showFinal(result);

    await this.sttClient.startRecording();
  }

  /** Renders a provisional (still-changing) caption. */
  showInterim(text) {
    this.captionElement.textContent = text;
    this.captionElement.classList.add('interim');
  }

  /** Renders a finished caption and saves it with timing information. */
  showFinal(result) {
    this.captionElement.textContent = result.text;
    this.captionElement.classList.remove('interim');
    // Save final caption with timestamp
    this.saveCaption(result);
  }

  /** Logs a finished caption with its start/end offsets. */
  saveCaption(result) {
    // Save caption with timing information
    console.log(`[${result.start_time_ms}-${result.end_time_ms}ms] ${result.text}`);
  }
}

Troubleshooting

Connection Issues

Problem: Connection fails immediately
  • Solution: Verify API key is correct and has proper permissions
  • Solution: Check network connectivity and firewall settings
Problem: Connection drops frequently
  • Solution: Implement reconnection logic with exponential backoff
  • Solution: Check for network instability or proxy issues

Audio Issues

Problem: No audio received (TTS)
  • Solution: Verify initialization message was sent and ready message received
  • Solution: Check that text messages are being sent correctly
Problem: Recognition not working (STT)
  • Solution: Verify audio format and sample rate match initialization
  • Solution: Check that audio chunks are being sent continuously
  • Solution: Ensure audio quality is sufficient (no excessive noise)

Performance Issues

Problem: High latency
  • Solution: Reduce audio chunk size for faster processing
  • Solution: Use appropriate sample rates (16kHz is usually sufficient)
  • Solution: Optimize network connection (use closer server if available)
Problem: High memory usage
  • Solution: Process and discard audio chunks after sending
  • Solution: Limit buffer sizes for audio data
  • Solution: Close unused connections promptly