Speaker identification

Last updated: 2024-02-08

Contributors

The Speaker Identification API can be used to determine "who speaks and when" in a conversation from a media file. It is intended to identify speakers who have been previously enrolled for speaker identification. The Speaker Identification API segments an audio clip into a sequence of utterances, each corresponding to a unique speaker. It then attempts to determine the identity of each speaker based upon their voice signature. In cases where the speaker is ambiguous or unknown, utterances are marked with a status of UserNotIdentified.

Request parameters

To identify speakers using the Speaker Identification API, one must formulate a request using the following request parameters:

Parameter	Type	Description
`encoding`	String	Encoding of the audio file, such as MP3 or WAV.
`languageCode`	String	Language spoken in the audio file. Default: `en-US`
`audioType`	String	Type of the audio based on number of speakers. Allowed values are: `CallCenter` (default), `Meeting`, `EarningsCalls`, `Interview`, `PressConference`, `Voicemail`. Optional, but is useful as a hint to aid in the identification process.
`speakerIds`	List[String]	List of previously enrolled speakers to identify in the media file.
`contentUri`	String	Publicly facing URL where the media file can be accessed.
`source`	String	Source of the audio file, e.g.: `Phone`, `RingCentral`, `GoogleMeet`, `Zoom` etc.

To properly identify speakers in a media file, speakers must have been previously enrolled so that their voice signature can be compared to the speakers in the audio file. It also relies on the developer having some knowledge of who the likely speakers are in the media file being processed, and providing a list of those speakers in their request.

Sample code

The following example code shows how to identify speakers from a conversation in a media file.

Follow the instructions in the quick start section to set up and run your server code before running the sample code below.

Running the code

Edit the variables in ALL CAPS with your app and user credentials before running the code.
You can only run the code against your production account, which means that you have to use your production app credentials.
Also make sure that you have enrolled at least one speaker id under your account.

JavaScript

const fs = require ('fs')
const RC = require('@ringcentral/sdk').SDK

// Webhook and media settings read by speakers_identification().
// They are declared at module scope so that they are real bindings rather
// than implicit globals created inside the login callback.
const NGROK_ADDRESS = "NGROK-TUNNEL-ADDRESS";
const WEBHOOK_URL = NGROK_ADDRESS + "/webhook";
const CONTENT_URI = "PUBLICLY-ACCESSIBLE-CONTENT-URI";

// Instantiate the SDK and get the platform instance
const rcsdk = new RC({
    server: 'https://platform.ringcentral.com',
    clientId: 'RC_APP_CLIENT_ID',
    clientSecret: 'RC_APP_CLIENT_SECRET'
});
const platform = rcsdk.platform();

// Start the speaker identification request once authentication succeeds.
platform.on(platform.events.loginSuccess, () => {
    speakers_identification();
});

// Report the failure and stop: nothing else can run without a session.
platform.on(platform.events.loginError, (e) => {
    console.log("Unable to authenticate to platform. Check credentials.", e.message);
    process.exit(1);
});

/* Authenticate a user using a personal JWT token */
// Without this call neither of the handlers registered above ever fires.
platform.login({ jwt: 'RC_USER_JWT' }).catch(() => {
    // The failure is reported by the loginError handler above; this catch
    // only prevents an additional unhandled promise rejection.
});

/**
 * Identify speakers from a conversation.
 *
 * Reads the enrolled speaker ids and, when there is at least one, posts an
 * asynchronous speaker-identify request for CONTENT_URI. Only the job id is
 * printed here (when the API answers 202); the request names WEBHOOK_URL as
 * the webhook for the job.
 *
 * Failures of the API call are logged, not rethrown.
 *
 * @returns {Promise<void>}
 */
async function speakers_identification() {
    const enrolledSpeakerIds = await read_enrolled_speakers();
    if (enrolledSpeakerIds.length === 0) {
        console.log("No enrolled speakers. Please enroll a few speaker ids and try again.");
        return;
    }

    try {
        const bodyParams = {
            contentUri:   CONTENT_URI,
            encoding:     "Mpeg",
            languageCode: "en-US",
            source:       "RingCentral",
            audioType:    "CallCenter",
            speakerIds:   enrolledSpeakerIds
        };
        // The webhook address is itself a URL, so it must be percent-encoded
        // before being embedded as a query-string value; otherwise any "?",
        // "&" or "#" it contains would be parsed as part of the outer URL.
        const endpoint = `/ai/audio/v1/async/speaker-identify?webhook=${encodeURIComponent(WEBHOOK_URL)}`;
        const resp = await platform.post(endpoint, bodyParams);
        const jsonObj = await resp.json();
        if (resp.status === 202) {
            console.log("Job ID: " + jsonObj.jobId);
            console.log("Ready to receive response at: " + WEBHOOK_URL);
        }
    } catch (e) {
        console.log(`Unable to call speaker identify API. ${e.message}`);
    }
}

/**
 * Read the account enrolled speakers.
 *
 * Requests the first page (up to 100 records) of the enrollments list and
 * collects the `speakerId` of every returned record.
 *
 * Errors are logged, not rethrown; whatever ids were collected before the
 * failure (possibly none) are returned.
 *
 * @returns {Promise<string[]>} The enrolled speaker ids.
 */
async function read_enrolled_speakers() {
    const collectedIds = [];
    try {
        const resp = await platform.get("/ai/audio/v1/enrollments", {
            partial: false,
            perPage: 100,
            page: 1,
        });
        const jsonObj = await resp.json();
        for (const record of jsonObj.records) {
            collectedIds.push(record.speakerId);
        }
    } catch (err) {
        console.log("Unable to find enrolled speakers.", err.message);
    }
    return collectedIds;
}

Python

from ringcentral import SDK
import os,sys,urllib.parse,json

# Placeholders to edit before running: the tunnel address that exposes your
# webhook server, and the URL of the media file to analyze.
NGROK_ADDRESS = "NGROK-TUNNEL-ADDRESS"
WEBHOOK_URL = NGROK_ADDRESS + "/webhook"
CONTENT_URI = "PUBLICLY-ACCESSIBLE-CONTENT-URI"

#
# Identify speakers from a conversation
#
def speakers_identification():
    """Post an asynchronous speaker-identify request for CONTENT_URI.

    Reads the enrolled speaker ids first; when there are none, a hint is
    printed and no request is made. On a 202 response the job id is printed.
    The request names WEBHOOK_URL as the webhook for the job.

    Any exception raised while calling the API is printed, not re-raised.
    """
    enrolledSpeakerIds = read_enrolled_speakers()
    if len(enrolledSpeakerIds) == 0:
        print("No enrolled speakers. Please enroll a few speaker ids and try again.")
        return

    try:
        bodyParams = {
            'contentUri': CONTENT_URI,
            'encoding': "Mpeg",
            'languageCode': "en-US",
            'source': "RingCentral",
            'audioType': "CallCenter",
            'speakerIds': enrolledSpeakerIds
        }
        # The webhook address is percent-encoded because it travels as a
        # query-string value.
        endpoint = f'/ai/audio/v1/async/speaker-identify?webhook={urllib.parse.quote(WEBHOOK_URL)}'
        resp = platform.post(endpoint, bodyParams)
        # Parse the body once and reuse it, rather than calling resp.json()
        # a second time just to read the job id.
        jsonObj = resp.json()
        if resp.response().status_code == 202:
            print(f'Job ID: {jsonObj.jobId}')
            print(f'Ready to receive response at: {WEBHOOK_URL}')
    except Exception as e:
        print("Unable to call speaker identify API. " + str(e))

#
# Read the account enrolled speakers
#
def read_enrolled_speakers():
    """Return the speaker ids found in the account's enrollment list.

    Requests the first page (up to 100 records) of enrollments and collects
    the 'speakerId' of each record. Any exception is printed, not re-raised;
    the ids gathered before the failure (possibly none) are returned.
    """
    speaker_ids = []
    try:
        resp = platform.get(
            "/ai/audio/v1/enrollments",
            {'partial': False, 'perPage': 100, 'page': 1},
        )
        records = resp.json_dict()['records']
        # extend() consumes the generator item by item, so ids read before a
        # malformed record are kept, as with an explicit append loop.
        speaker_ids.extend(record['speakerId'] for record in records)
    except Exception as err:
        print("Unable to find enrolled speakers. " + str(err))

    return speaker_ids


# Authenticate a user using a personal JWT token
def login():
    """Log in with the personal JWT, then run the speaker identification.

    Any exception raised by either step is printed with the authentication
    failure message and is not re-raised.
    """
    try:
        platform.login(jwt="RC_USER_JWT")
        speakers_identification()
    except Exception as err:
        print(f"Unable to authenticate to platform. Check credentials. {err!s}")

# Instantiate the SDK and get the platform instance.
# The three string arguments are placeholders; replace the ALL-CAPS values
# with your own app credentials before running.
rcsdk = SDK("RC_APP_CLIENT_ID", "RC_APP_CLIENT_SECRET", "https://platform.ringcentral.com")
platform = rcsdk.platform()

# Entry point: login() authenticates and then calls speakers_identification().
login()

PHP

<?php
require('vendor/autoload.php');

// Instantiate the SDK and get the platform instance
$rcsdk = new RingCentral\SDK\SDK( 'RC_APP_CLIENT_ID', 'RC_APP_CLIENT_SECRET', 'https://platform.ringcentral.com' );
$platform = $rcsdk->platform();

// Settings read by speakers_identification() through `global`.
$NGROK_ADDRESS = "NGROK-TUNNEL-ADDRESS";
$WEBHOOK_URL = $NGROK_ADDRESS . "/webhook";
$CONTENT_URI = "PUBLICLY-ACCESSIBLE-CONTENT-URI";

/* Authenticate a user using a personal JWT token */
try {
  $platform->login(["jwt" => 'RC_USER_JWT']);
} catch (\RingCentral\SDK\Http\ApiException $e) {
  // Report a failed login the same way the API helpers below report their
  // errors, instead of letting the exception escape uncaught.
  print_r ('Unable to authenticate to platform. Check credentials. ' . $e->getMessage() . PHP_EOL);
  exit(1);
}

speakers_identification();

/*
* Identify speakers from a conversation
*
* Reads the enrolled speaker ids and, when there is at least one, posts an
* asynchronous speaker-identify request for $CONTENT_URI, naming $WEBHOOK_URL
* as the webhook. Prints the job id on a 202 response. An ApiException is
* printed, not rethrown.
*/
function speakers_identification()
{
  global $platform, $WEBHOOK_URL, $CONTENT_URI;

  $speakerIds = read_enrolled_speakers();
  if (count($speakerIds) == 0) {
    print_r ('No enrolled speakers. Please enroll a few speaker ids and try again.' . PHP_EOL);
    return;
  }

  try {
    $payload = [
        'contentUri' => $CONTENT_URI,
        'encoding' => "Mpeg",
        'languageCode' => "en-US",
        'source' => "RingCentral",
        'audioType' => "CallCenter",
        'speakerIds' => $speakerIds,
    ];
    // The webhook address is url-encoded because it is sent as a query value.
    $url = "/ai/audio/v1/async/speaker-identify?webhook=" . urlencode($WEBHOOK_URL);
    $apiResponse = $platform->post($url, $payload);
    $decoded = $apiResponse->json();
    if ($apiResponse->response()->getStatusCode() == 202) {
      print_r ("Job ID: " . $decoded->jobId . PHP_EOL);
      print_r ("Ready to receive response at: " . $WEBHOOK_URL . PHP_EOL);
    }
  } catch (\RingCentral\SDK\Http\ApiException $e) {
    print_r ('Unable to call speaker identify API. ' . $e->getMessage() . PHP_EOL);
  }
}

/*
* Read the account enrolled speakers
*
* Requests the first page (up to 100 records) of the enrollment list and
* returns the speakerId of each record as an array. An ApiException is
* printed, not rethrown; the ids collected so far are still returned.
*/
function read_enrolled_speakers() {
  global $platform;

  $speakerIds = [];
  try {
    $query = [
        'partial' => false,
        'perPage' => 100,
        'page' => 1,
    ];
    $apiResponse = $platform->get("/ai/audio/v1/enrollments", $query);
    $decoded = $apiResponse->json();
    foreach ($decoded->records as $record) {
      $speakerIds[] = $record->speakerId;
    }
  } catch (\RingCentral\SDK\Http\ApiException $e) {
    print_r ('Unable to find enrolled speakers. ' . $e->getMessage() . PHP_EOL);
  }
  return $speakerIds;
}
?>

Ruby

require 'ringcentral'

# Placeholders to edit before running: the tunnel address that exposes your
# webhook server, and the URL of the media file to analyze.
NGROK_ADDRESS = "NGROK-TUNNEL-ADDRESS"
WEBHOOK_URL = NGROK_ADDRESS + "/webhook"
CONTENT_URI = "PUBLICLY-ACCESSIBLE-CONTENT-URI"

#
# Identify speakers from a conversation
#
# Reads the enrolled speaker ids and, when there is at least one, posts an
# asynchronous speaker-identify request for CONTENT_URI with WEBHOOK_URL as
# the webhook query parameter. Prints the job id on a 202 response. Errors
# raised by the request are printed, not re-raised.
#
def speakers_identification()
  speaker_ids = read_enrolled_speakers()
  unless speaker_ids.length > 0
    puts("No enrolled speakers. Please enroll a few speaker ids and try again.")
    return
  end

  payload = {
    contentUri: CONTENT_URI,
    encoding: "Mpeg",
    languageCode: "en-US",
    source: "RingCentral",
    audioType: "CallCenter",
    speakerIds: speaker_ids
  }
  query = { webhook: WEBHOOK_URL }
  path = "/ai/audio/v1/async/speaker-identify"

  begin
    resp = $platform.post(path, payload: payload, params: query)
    body = resp.body
    if resp.status == 202
      puts('Job ID: ' + body['jobId'])
      puts('Ready to receive response at: ' + WEBHOOK_URL)
    end
  rescue StandardError => e
    puts("Unable to call speaker identify API. " + e.to_s)
  end
end

#
# Read the account enrolled speakers
#
# Requests the first page (up to 100 records) of the enrollment list and
# returns the 'speakerId' of each record. Errors are printed, not re-raised;
# the ids collected before the failure (possibly none) are returned.
#
def read_enrolled_speakers()
  speaker_ids = []
  begin
    query = { partial: false, perPage: 100, page: 1 }
    resp = $platform.get("/ai/audio/v1/enrollments", query)
    resp.body['records'].each do |record|
      speaker_ids << record['speakerId']
    end
  rescue StandardError => e
    puts("Unable to find enrolled speakers. " + e.to_s)
  end
  speaker_ids
end


# Authenticate a user using a personal JWT token, then run the speaker
# identification. Any StandardError raised by either step is printed with the
# authentication failure message and is not re-raised.
def login()
  $platform.authorize( jwt: "RC_USER_JWT" )
  speakers_identification()
rescue StandardError => e
  puts("Unable to authenticate to platform. Check credentials. " + e.to_s)
end

# Instantiate the SDK and get the platform instance.
# The three string arguments are placeholders; replace the ALL-CAPS values
# with your own app credentials before running.
$platform = RingCentral.new( "RC_APP_CLIENT_ID", "RC_APP_CLIENT_SECRET", "https://platform.ringcentral.com" )

# Entry point: login() authenticates and then calls speakers_identification().
login()

C#

using System;
using System.IO;
using System.Threading.Tasks;
using System.Collections.Generic;
using RingCentral;
using Newtonsoft.Json;

namespace SpeakersIdentification {
  /// <summary>
  /// Console sample: authenticates with a personal JWT, reads the enrolled
  /// speaker ids and posts an asynchronous speaker-identify request.
  /// </summary>
  class Program {
    static RestClient restClient;

    // Placeholders to edit before running.
    static string NGROK_ADDRESS = "NGROK-TUNNEL-ADDRESS";
    static string WEBHOOK_URL = NGROK_ADDRESS + "/webhook";
    static string CONTENT_URI = "PUBLICLY-ACCESSIBLE-CONTENT-URI";

    /// <summary>
    /// Entry point. Any exception escaping the steps below is printed with
    /// the authentication failure message.
    /// </summary>
    static async Task Main(string[] args){
      try
      {
        // Instantiate the SDK
        restClient = new RestClient("RC_APP_CLIENT_ID", "RC_APP_CLIENT_SECRET", "https://platform.ringcentral.com");

        // Authenticate a user using a personal JWT token
        await restClient.Authorize("RC_USER_JWT");

        await speakers_identification();
      }
      catch (Exception authError)
      {
        Console.WriteLine("Unable to authenticate to platform. Check credentials. " + authError.Message);
      }
    }

    /// <summary>
    /// Identify speakers from a conversation. Posts the request only when at
    /// least one speaker is enrolled, and prints the resulting job id.
    /// Exceptions from the API call are printed, not rethrown.
    /// </summary>
    static private async Task speakers_identification()
    {
      var speakerIds = await read_enrolled_speakers();
      if (speakerIds.Length == 0)
      {
        Console.WriteLine("No enrolled speakers. Please enroll a few speaker ids and try again.");
        return;
      }

      try
      {
        var request = new IdentifyInput()
        {
          contentUri = CONTENT_URI,
          encoding = "Wav",
          languageCode = "en-US",
          source = "RingCentral",
          audioType = "CallCenter",
          enableVoiceActivityDetection = true,
          speakerIds = speakerIds
        };
        var query = new CaiSpeakerIdentifyParameters() { webhook = WEBHOOK_URL };

        var job = await restClient.Ai().Audio().V1().Async().SpeakerIdentify().Post(request, query);
        Console.WriteLine("Job ID: " + job.jobId);
        Console.WriteLine("Ready to receive response at: " + WEBHOOK_URL);
      }
      catch (Exception apiError)
      {
        Console.WriteLine("Unable to call speaker identify API. " + apiError.Message);
      }
    }

    /// <summary>
    /// Read the account enrolled speakers: first page, up to 100 records.
    /// Exceptions are printed, not rethrown; the ids collected before the
    /// failure (possibly none) are returned.
    /// </summary>
    static private async Task<String[]> read_enrolled_speakers()
    {
      var collected = new List<String>();
      try
      {
        var query = new CaiEnrollmentsListParameters()
        {
          partial = false,
          perPage = 100,
          page = 1
        };
        var listing = await restClient.Ai().Audio().V1().Enrollments().List(query);
        foreach (var record in listing.records)
        {
          collected.Add(record.speakerId);
        }
      }
      catch (Exception listError)
      {
        Console.WriteLine("Unable to find enrolled speakers. " + listError.Message);
      }
      return collected.ToArray();
    }
  }
}

Java

package SpeakersIdentification;

import java.io.IOException;
import java.util.ArrayList;

import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;

import com.ringcentral.*;
import com.ringcentral.definitions.*;

/**
 * Console sample: authenticates with a personal JWT, reads the enrolled
 * speaker ids and posts an asynchronous speaker-identify request.
 */
public class SpeakersIdentification {
    // Placeholders to edit before running. They are static because they are
    // read from the static methods below.
    static final String NGROK_ADDRESS = "NGROK-TUNNEL-ADDRESS";
    static final String WEBHOOK_URL = NGROK_ADDRESS + "/webhook";
    static final String CONTENT_URI = "PUBLICLY-ACCESSIBLE-CONTENT-URI";

    static RestClient restClient;

    public static void main(String[] args) {
      try {
        // Instantiate the SDK
        restClient = new RestClient("RC_APP_CLIENT_ID", "RC_APP_CLIENT_SECRET", "https://platform.ringcentral.com");

        // Authenticate a user using a personal JWT token
        restClient.authorize("RC_USER_JWT");

        // The method is static, so it is called directly; no instance of
        // this class is needed.
        speakers_identification();

      } catch (RestException e) {
        System.out.println(e.getMessage());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    /*
    * Identify speakers from a conversation.
    *
    * Posts the request only when at least one speaker is enrolled, and
    * prints the resulting job id. A RestException from the API call is
    * printed, not rethrown.
    */
    static private void speakers_identification() throws RestException, IOException {
        var enrolledSpeakerIds = read_enrolled_speakers();
        if (enrolledSpeakerIds.length == 0) {
            System.out.println("No enrolled speakers. Please enroll a few speaker ids and try again.");
            return;
        }

        try {
            var bodyParams = new IdentifyInput()
                .contentUri(CONTENT_URI)
                .encoding("Wav")
                .languageCode("en-US")
                .source("RingCentral")
                .audioType("CallCenter")
                .enableVoiceActivityDetection(true)
                .speakerIds(enrolledSpeakerIds);

            var queryParams = new CaiSpeakerIdentifyParameters().webhook(WEBHOOK_URL);
            var resp = restClient.ai().audio().v1().async().speakerIdentify().post(bodyParams, queryParams);
            System.out.println("Job ID: " + resp.jobId);
            System.out.println("Ready to receive response at: " + WEBHOOK_URL);
        } catch (RestException ex) {
            System.out.println("Unable to call speaker identify API. " + ex.getMessage());
        }
    }

    /*
    * Read the account enrolled speakers.
    *
    * A RestException is printed, not rethrown; the ids collected before the
    * failure (possibly none) are returned.
    */
    static private String[] read_enrolled_speakers() throws RestException, IOException {
        ArrayList<String> speakerIdList = new ArrayList<>();
        try {
            // Same query as the other language samples on this page:
            // partial = false, first page, up to 100 records.
            var queryParams = new CaiEnrollmentsListParameters()
                .partial(false)
                .perPage(100L)
                .page(1L);

            var resp = restClient.ai().audio().v1().enrollments().list(queryParams);
            for (var enrollment : resp.records) {
                speakerIdList.add(enrollment.speakerId);
            }
        } catch (RestException ex) {
            System.out.println("Unable to find enrolled speakers. " + ex.getMessage());
        }
        return speakerIdList.toArray(new String[0]);
    }
}

Sample response

If your speaker identification request is processed successfully, the response payload will resemble the following:

{
    "jobId": "4aa767d8-a4f7-11ee-ac8c-0050568c3c69",
    "api": "/ai/audio/v1/async/speaker-identify",
    "creationTime": "2023-12-27T20:34:11.287Z",
    "completionTime": "2023-12-27T20:34:13.016Z",
    "expirationTime": "2024-01-03T20:34:11.287Z",
    "status": "Success",
    "response": {
      "utterances": [
          {
              "start": 0,
              "end": 6,
              "speakerId": "62295327016",
              "confidence": 35.42294
          },
          {
              "start": 6,
              "end": 12,
              "speakerId": "62285329012",
              "confidence": 36.98796
          },
          {
              "start": 12,
              "end": 18.0,
              "speakerId": "62294637102",
              "confidence": 25.51731
          }
        ]
    }
}

Each utterance contains the following information and parameters:

Parameter	Type	Description
`speakerId`	String	Speaker id of the identified speaker.
`start`	Float	Start of the audio segment.
`end`	Float	End of the audio segment.
`confidence`	Float	Confidence score of speaker identification.