Skip to content

Create transcription

Transcribes audio files into text using relaxAI’s advanced speech-to-text models. This endpoint supports various audio formats and provides options for customization, including language selection and speaker diarization.

POST https://api.relax.ai/v1/audio/transcriptions

from openai import OpenAI

# Client pointed at relaxAI's OpenAI-compatible API.
client = OpenAI(
    api_key=RELAX_API_KEY,
    base_url='https://api.relax.ai/v1/',
)

# Open the audio file with a context manager so the handle is closed
# even if the request raises (the original left the file open), and
# to match the diarization example further down this page.
with open("speech.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="Voxtral-Small-24B-2507",
        file=audio_file,
    )

print(transcript.text)
import fs from "node:fs";
import { OpenAI } from "openai";

// NOTE: the OpenAI Node SDK option is `baseURL` (capital URL).
// `baseUrl` is silently ignored, so the original example would have
// sent requests to the SDK's default host instead of api.relax.ai.
const openai = new OpenAI({
  apiKey: RELAX_API_KEY,
  baseURL: 'https://api.relax.ai/v1/',
});

async function main() {
  // Stream the audio file to the transcriptions endpoint.
  const transcription = await openai.audio.transcriptions.create({
    file: fs.createReadStream("audio.mp3"),
    model: "Voxtral-Small-24B-2507",
  });
  console.log(transcription.text);
}

main();
Terminal window
# POST the audio file as multipart form data to the transcriptions
# endpoint. (The original example pointed at /v1/chat/completions,
# which is the chat endpoint, not the audio-transcription endpoint
# documented on this page.)
curl https://api.relax.ai/v1/audio/transcriptions \
  -H "Authorization: Bearer $RELAX_API_KEY" \
  -H "Content-Type: multipart/form-data" \
  -F file="@/path/to/file/audio.mp3" \
  -F model="Voxtral-Small-24B-2507"

Returns a 200 OK response code with a JSON object containing the transcription details.


Transcription Response
{
"text": "In this video, we will explore conversation dialogues between two friends...",
"logprobs": null,
"usage": {
"input_tokens": 3000,
"output_tokens": 111,
"total_tokens": 3111,
"type": "duration",
"input_token_details": {
"audio_tokens": 3000,
"text_tokens": 111
},
"seconds": 60
},
"duration": 0,
"language": "",
"segments": null,
"words": null
}

The following parameters can be included in the request body:


ParameterTypeRequiredDescription
filefileYesThe audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
modelstringYesID of the model to use. Currently the only available option is Voxtral-Small-24B-2507.
known_speaker_namesarrayNoOptional list of speaker names that correspond to the audio samples provided in known_speaker_references[]. Each entry should be a short identifier (for example customer or agent). Up to 4 speakers are supported.
known_speaker_referencesarrayNoOptional list of audio samples (as data URLs) that contain known speaker references matching known_speaker_names[]. Each sample must be between 2 and 10 seconds, and can use any of the same input audio formats supported by file.
languagestringNoThe language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
promptstringNoAn optional text to guide the model’s style or continue a previous audio segment. The prompt should match the audio language. The prompt parameter is not supported when requesting diarized output (response_format set to diarized_json).
response_formatstringNoThe format of the output, in one of the following options: json, text, srt, verbose_json, vtt, or diarized_json. Default: json
streambooleanNoIf set to true, the model response data will be streamed to the client as it is generated using server-sent events. See the Streaming section of the Speech-to-Text guide for more information. Default: false
temperaturenumberNoThe sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit. Default: 0
timestamp_granularitiesarrayNoThe timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Currently only word is supported.

import base64
from openai import OpenAI

# Client pointed at relaxAI's OpenAI-compatible API.
client = OpenAI(
    api_key=RELAX_API_KEY,
    base_url='https://api.relax.ai/v1/',
)


def to_data_url(path: str) -> str:
    """Return the WAV file at *path* as a base64-encoded data URL."""
    with open(path, "rb") as fh:
        raw = fh.read()
    encoded = base64.b64encode(raw).decode("utf-8")
    return f"data:audio/wav;base64,{encoded}"


# Transcribe a meeting recording with speaker diarization, seeding the
# "agent" speaker label from a short reference clip of that speaker.
with open("meeting.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="Voxtral-Small-24B-2507",
        file=audio_file,
        response_format="diarized_json",
        chunking_strategy="auto",
        extra_body={
            "known_speaker_names": ["agent"],
            "known_speaker_references": [to_data_url("agent.wav")],
        },
    )

print(transcript.segments)
import fs from "node:fs";
import { OpenAI } from "openai";

// NOTE: the OpenAI Node SDK option is `baseURL` (capital URL).
// `baseUrl` is silently ignored, so the original example would have
// sent requests to the SDK's default host instead of api.relax.ai.
const openai = new OpenAI({
  apiKey: RELAX_API_KEY,
  baseURL: 'https://api.relax.ai/v1/',
});

// Base64-encode a short reference clip of the known "agent" speaker.
const speakerRef = fs.readFileSync("agent.wav").toString("base64");

// Diarized transcription: known_speaker_names / known_speaker_references
// seed the speaker labels used in the returned segments.
const transcript = await openai.audio.transcriptions.create({
  file: fs.createReadStream("meeting.wav"),
  model: "Voxtral-Small-24B-2507",
  response_format: "diarized_json",
  chunking_strategy: "auto",
  extra_body: {
    known_speaker_names: ["agent"],
    known_speaker_references: [`data:audio/wav;base64,${speakerRef}`],
  },
});
console.log(transcript.segments);
Terminal window
# Diarized transcription via curl: known_speaker_names[] and
# known_speaker_references[] seed the "agent" speaker label from a
# short base64 data-URL audio sample (truncated here as "AAA...").
curl https://api.relax.ai/v1/audio/transcriptions \
-H "Authorization: Bearer $RELAX_API_KEY" \
-H "Content-Type: multipart/form-data" \
-F file="@/path/to/file/meeting.wav" \
-F model="Voxtral-Small-24B-2507" \
-F response_format="diarized_json" \
-F chunking_strategy=auto \
-F 'known_speaker_names[]=agent' \
-F 'known_speaker_references[]=data:audio/wav;base64,AAA...'
{
"task": "transcribe",
"duration": 27.4,
"text": "Agent: Thanks for calling relaxAI support. A: Hi, I'm trying to enable diarization. Agent: Happy to walk you through the steps.",
"segments": [
{
"type": "transcript.text.segment",
"id": "seg_001",
"start": 0.0,
"end": 4.7,
"text": "Thanks for calling relaxAI support.",
"speaker": "agent"
},
{
"type": "transcript.text.segment",
"id": "seg_002",
"start": 4.7,
"end": 11.8,
"text": "Hi, I'm trying to enable diarization.",
"speaker": "A"
},
{
"type": "transcript.text.segment",
"id": "seg_003",
"start": 12.1,
"end": 18.5,
"text": "Happy to walk you through the steps.",
"speaker": "agent"
}
],
"usage": {
"type": "duration",
"seconds": 27
}
}