Speaker diarization

Last updated: 2023-01-20

Speaker diarization is the process of partitioning an audio stream into homogeneous segments according to speaker identity; it answers the question "who spoke when." This API splits an audio clip into speech segments and tags each segment with a speaker ID. It can also identify enrolled speakers by their speaker IDs if they were previously enrolled using the Speaker Enrollment API.

Using the Diarization API

For the best results, we recommend following these guidelines.

  • The audioType parameter gives the system a hint about the nature of the audio, which helps improve accuracy. We recommend setting it to CallCenter when 2-3 speakers are expected and Meeting when 4-6 speakers are expected.

  • Set the enableVoiceActivityDetection parameter to True if you want silence and noise segments removed from the diarization output. We suggest setting it to True in most circumstances.

  • Setting the source parameter helps optimize the diarization process by letting the service use a specialized acoustic model built for the corresponding audio source.

  • For proper speaker identification, make sure you have previously enrolled every speaker in the media file and include them in the speakerIds parameter, as shown in the sketch after this list.
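
For illustration, here is a hypothetical request body that follows these guidelines for a two-speaker support call. The media URL and the enrolled speaker IDs (agentId, customerId) are placeholders, not real identifiers.

# A hypothetical payload following the guidelines above; replace the
# placeholder URL and speaker IDs with your own values.
payload = {
    "contentUri":                   "https://example.com/support-call.mp3",  # placeholder URL
    "encoding":                     "Mpeg",
    "languageCode":                 "en-US",
    "source":                       "Phone",
    "audioType":                    "CallCenter",               # 2-3 speakers expected
    "enableVoiceActivityDetection": True,                       # drop silence and noise segments
    "speakerCount":                 2,
    "speakerIds":                   ["agentId", "customerId"]   # previously enrolled speakers
}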

Request parameters

| Parameter | Type | Description |
|---|---|---|
| encoding | String | Encoding of the audio file, e.g. MP3 or WAV. |
| languageCode | String | Language spoken in the audio file. Default: "en-US". |
| separateSpeakerPerChannel | Boolean | Set to True if the input audio is multi-channel and each channel has a separate speaker. Optional. Default: False. |
| speakerCount | Number | Number of speakers in the file. Optional. |
| audioType | String | Type of the audio based on the number of speakers. Optional. Permitted values: CallCenter (default), Meeting, EarningsCalls, Interview, PressConference. |
| speakerIds | List[String] | Set of enrolled speakers to be identified in the call. Optional. |
| contentUri | String | Publicly accessible URL of the audio file. |
| source | String | Source of the audio file, e.g. Phone, RingCentral, GoogleMeet, Zoom. Optional. |
| enableVoiceActivityDetection | Boolean | Apply voice activity detection. Optional. Default: False. |
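
Only a few of these fields are needed to start a job; the sketch below relies on the documented defaults for everything else. The media URL, webhook, and access token are placeholders, and omitting the optional parameters is an assumption based on the defaults listed above.

import requests

# Minimal diarization request relying on the documented defaults;
# replace the placeholder URL, webhook, and token with real values.
resp = requests.post(
    "https://platform.ringcentral.com/ai/audio/v1/async/speaker-diarize",
    params={"webhook": "<webhookUrl>"},
    json={
        "contentUri": "https://example.com/recording.mp3",  # placeholder URL
        "encoding":   "Mpeg"
    },
    headers={"Authorization": "Bearer <INSERT YOUR ACCESS TOKEN>"}
)
print(resp.status_code)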

Example code

After you have set up a simple web server to process the response, copy and paste the code below into index.js, making sure to edit the variables in ALL CAPS so the code runs properly.

const RC = require('@ringcentral/sdk').SDK;
require('dotenv').config();

const MEDIA_URL   = process.env.RC_MEDIA_URL;
const WEBHOOK_URL = '<INSERT YOUR WEBHOOK URL>';

// Initialize the RingCentral SDK and Platform
const rcsdk = new RC({
    'server':       process.env.RC_SERVER_URL,
    'clientId':     process.env.RC_CLIENT_ID,
    'clientSecret': process.env.RC_CLIENT_SECRET
});

const platform = rcsdk.platform();

// Authenticate with the RingCentral Developer Platform using the developer's JWT credential
platform.login({
    'jwt': process.env.RC_JWT
});

// Call the Speaker Diarization API right after login asynchronously
platform.on(platform.events.loginSuccess, () => {
    detectSpeaker();
})

async function detectSpeaker() {
    try {
        console.log("Calling RingCentral Speaker Diarization API");
        let resp = await platform.post("/ai/audio/v1/async/speaker-diarize?webhook=" + WEBHOOK_URL, {
            "contentUri":                   MEDIA_URL,
            "encoding":                     "Mpeg",
            "languageCode":                 "en-US",
            "source":                       "RingCentral",
            "audioType":                    "Meeting",
            "separateSpeakerPerChannel":    false,
            "speakerCount":                 0,
            "enableVoiceActivityDetection": true
        });
        console.log("Job is " + resp.statusText + " with HTTP status code " + resp.status);
    } 
    catch (e) {
        console.log("An error occurred : " + e.message);
    }
}

Run your sample code.

$ node index.js
Alternatively, you can make the same request in Python:

import requests
import base64

url = "https://platform.ringcentral.com/ai/audio/v1/async/speaker-diarize"

querystring = {"webhook":"<webhookUrl>"}

payload = {
    "encoding": "Mpeg",
    "languageCode": "en-US",
    "source": "RingCentral",
    "audioType": "Meeting",
    "separateSpeakerPerChannel": False,
    "speakerCount": 2,
    "speakerIds": [
        "speakerId1",
        "speakerId2"
    ],
    "enableVoiceActivityDetection": True,
}

# The API accepts audio either as a publicly accessible URL or as
# base64-encoded content; provide one of "contentUri" or "content".
# Option 1: pass the audio by URL:
payload["contentUri"] = "https://publicly-facing-url.mp3"

# Option 2: pass the audio as base64-encoded content instead:
# audioFileName = "<PATH TO YOUR AUDIO FILE>"
# with open(audioFileName, 'rb') as fin:
#     audioContent = fin.read()
# payload["content"] = base64.b64encode(audioContent).decode('utf-8')

headers = {
    'Content-Type':  "application/json",
    'Authorization': "Bearer <INSERT YOUR ACCESS TOKEN>"
}

response = requests.post(url, json=payload, headers=headers, params=querystring)
print(response.status_code)

Example response

{
    "status": "Success",
    "response": {
      "speakerCount": 2,
      "utterances": [
        {
          "speakerId": "JohnDoe",
          "start": 0.3,
          "end": 5.1,
          "confidence": 0.97
        },
        ...
      ] 
    }
}
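
Because the job runs asynchronously, the result above is delivered to your webhook URL rather than returned in the HTTP response. Below is a minimal sketch of a webhook receiver; Flask and the /webhook route are assumptions for illustration, and the field names come from the example response above.

from flask import Flask, request

app = Flask(__name__)

# Hypothetical route; it must match the path of the webhook URL
# you passed to the Diarization API.
@app.route('/webhook', methods=['POST'])
def diarization_webhook():
    result = request.get_json()
    if result and result.get('status') == 'Success':
        # Print who spoke, and when, for each diarized segment
        for u in result['response']['utterances']:
            print(f"{u['speakerId']}: {u['start']}s - {u['end']}s "
                  f"(confidence {u['confidence']})")
    return '', 200

if __name__ == '__main__':
    app.run(port=8080)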