
Create transcription

Transcribes audio files into text using relaxAI’s advanced speech-to-text models. This endpoint supports various audio formats and provides options for customization, including language selection and speaker diarization.

POST https://api.relax.ai/v1/audio/transcriptions

Example Request

Python

from openai import OpenAI

client = OpenAI(
    api_key=RELAX_API_KEY,
    base_url="https://api.relax.ai/v1/",
)

audio_file = open("speech.mp3", "rb")
transcript = client.audio.transcriptions.create(
    model="Voxtral-Small-24B-2507",
    file=audio_file,
)
print(transcript.text)
JavaScript

import fs from "node:fs";
import { OpenAI } from "openai";

const openai = new OpenAI({
  apiKey: RELAX_API_KEY,
  baseURL: "https://api.relax.ai/v1/",
});

async function main() {
  const transcription = await openai.audio.transcriptions.create({
    file: fs.createReadStream("audio.mp3"),
    model: "Voxtral-Small-24B-2507",
  });
  console.log(transcription.text);
}

main();
cURL

curl https://api.relax.ai/v1/audio/transcriptions \
  -H "Authorization: Bearer $RELAX_API_KEY" \
  -H "Content-Type: multipart/form-data" \
  -F file="@/path/to/file/audio.mp3" \
  -F model="Voxtral-Small-24B-2507"

Response

Returns a 200 OK response code with a JSON object containing the transcription details.


Transcription Response
{
  "text": "In this video, we will explore conversation dialogues between two friends...",
  "logprobs": null,
  "usage": {
    "input_tokens": 3000,
    "output_tokens": 111,
    "total_tokens": 3111,
    "type": "duration",
    "input_token_details": {
      "audio_tokens": 3000,
      "text_tokens": 111
    },
    "seconds": 60
  },
  "duration": 0,
  "language": "",
  "segments": null,
  "words": null
}

Request Body

The following parameters can be included in the request body:


Transcription Request Body

file

  • Type: file
  • Required: Yes
  • Description: The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

model

  • Type: string
  • Required: Yes
  • Description: ID of the model to use. Currently the only supported option is Voxtral-Small-24B-2507.

known_speaker_names

  • Type: array
  • Required: No
  • Description: Optional list of speaker names that correspond to the audio samples provided in known_speaker_references[]. Each entry should be a short identifier (for example customer or agent). Up to 4 speakers are supported.

known_speaker_references

  • Type: array
  • Required: No
  • Description: Optional list of audio samples (as data URLs) that contain known speaker references matching known_speaker_names[]. Each sample must be between 2 and 10 seconds, and can use any of the same input audio formats supported by file.
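
The OpenAI SDKs may not expose these diarization fields directly, so one option is to post the multipart form with a plain HTTP client. Below is a minimal Python sketch, assuming two hypothetical 2-10 second reference clips (agent_sample.wav and customer_sample.wav), a placeholder meeting.mp3 input, and a RELAX_API_KEY environment variable; it encodes each clip as a data URL for known_speaker_references[].

import base64
import os

import requests

def to_data_url(path: str, mime: str = "audio/wav") -> str:
    """Encode a short (2-10 s) speaker reference clip as a data URL."""
    with open(path, "rb") as f:
        return f"data:{mime};base64," + base64.b64encode(f.read()).decode("ascii")

response = requests.post(
    "https://api.relax.ai/v1/audio/transcriptions",
    headers={"Authorization": f"Bearer {os.environ['RELAX_API_KEY']}"},
    files={"file": open("meeting.mp3", "rb")},
    data={
        "model": "Voxtral-Small-24B-2507",
        "response_format": "diarized_json",
        # Names and reference clips are parallel arrays; up to 4 speakers.
        "known_speaker_names[]": ["agent", "customer"],
        "known_speaker_references[]": [
            to_data_url("agent_sample.wav"),
            to_data_url("customer_sample.wav"),
        ],
    },
)
response.raise_for_status()
print(response.json())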

language

  • Type: string
  • Required: No
  • Description: The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.

prompt

  • Type: string
  • Required: No
  • Description: An optional text to guide the model’s style or continue a previous audio segment. The prompt should match the audio language.
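
Both language and prompt are plain request fields, so they pass straight through the SDK. A minimal Python sketch, assuming a placeholder interview.mp3 file and the same client setup as the examples above:

import os

from openai import OpenAI

client = OpenAI(
    api_key=os.environ["RELAX_API_KEY"],
    base_url="https://api.relax.ai/v1/",
)

with open("interview.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="Voxtral-Small-24B-2507",
        file=audio_file,
        language="en",  # ISO-639-1 code; improves accuracy and latency
        prompt="Transcript of a product interview between two engineers.",
    )
print(transcript.text)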

response_format

  • Type: string
  • Required: No
  • Default: json
  • Description: The format of the output, in one of these formats: json, text, srt, verbose_json, vtt, or diarized_json.
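
For the non-JSON formats, the response body is the document itself rather than an object. A minimal Python sketch, assuming the same client setup as above, that writes SRT subtitles to disk:

import os

from openai import OpenAI

client = OpenAI(
    api_key=os.environ["RELAX_API_KEY"],
    base_url="https://api.relax.ai/v1/",
)

with open("speech.mp3", "rb") as audio_file:
    srt_doc = client.audio.transcriptions.create(
        model="Voxtral-Small-24B-2507",
        file=audio_file,
        response_format="srt",  # returned as a plain string, not JSON
    )

with open("speech.srt", "w") as out:
    out.write(srt_doc)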

stream

  • Type: boolean
  • Required: No
  • Default: false
  • Description: If set to true, the model response data will be streamed to the client as it is generated using server-sent events. See the Streaming section of the Speech-to-Text guide for more information.
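
A minimal Python sketch of consuming the stream, assuming the OpenAI SDK's transcription streaming events (objects with a type field carrying incremental text deltas); the exact event names here are an assumption, so check the Streaming guide:

import os

from openai import OpenAI

client = OpenAI(
    api_key=os.environ["RELAX_API_KEY"],
    base_url="https://api.relax.ai/v1/",
)

with open("speech.mp3", "rb") as audio_file:
    stream = client.audio.transcriptions.create(
        model="Voxtral-Small-24B-2507",
        file=audio_file,
        stream=True,
    )
    for event in stream:
        # Assumed event shape: "transcript.text.delta" carries partial text.
        if event.type == "transcript.text.delta":
            print(event.delta, end="", flush=True)
print()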

temperature

  • Type: number
  • Required: No
  • Default: 0
  • Description: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.

timestamp_granularities

  • Type: array
  • Required: No
  • Description: The timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Currently only word is supported.
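
A minimal Python sketch requesting word-level timestamps, assuming the same client setup as the earlier examples; note that response_format must be verbose_json:

import os

from openai import OpenAI

client = OpenAI(
    api_key=os.environ["RELAX_API_KEY"],
    base_url="https://api.relax.ai/v1/",
)

with open("speech.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="Voxtral-Small-24B-2507",
        file=audio_file,
        response_format="verbose_json",
        timestamp_granularities=["word"],
    )

# Each entry carries the word plus its start/end offsets in seconds.
for word in transcript.words:
    print(f"{word.start:6.2f}-{word.end:6.2f}s  {word.word}")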