Friday, February 23, 2024

Build Your Own Audio Transcription Tool with AI

 In this video, you will learn how to deploy an LLM-based application into production using Amazon Bedrock and Amazon Transcribe: audio files are transcribed with Transcribe's ASR model and then summarized with the Titan LLM. 


import boto3
import json
import uuid
import time
from jinja2 import Template

# AWS service clients.
# NOTE(review): Bedrock is pinned to us-east-1 while S3 and Transcribe use
# ap-southeast-2 — confirm the bucket actually lives in the Transcribe region.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')
s3_client = boto3.client(service_name='s3', region_name='ap-southeast-2')
transcribe_client = boto3.client('transcribe', region_name='ap-southeast-2')

# Input location: the audio file must already be uploaded to this bucket.
bucket_name='<replace your bucket name here>'
file_name = 'happy.mp3'  # alternative sample: 'angry.mp3'

# Unique job name so repeated runs never collide with a previous job.
job_name = 'transcription-job-' + str(uuid.uuid4())

# Kick off an asynchronous transcription job for the audio file in S3.
# NOTE(review): the original post was truncated here; the required arguments
# (job name, format, language, output bucket, Settings wrapper) are
# reconstructed — confirm against the original course notebook.
response = transcribe_client.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': f's3://{bucket_name}/{file_name}'},
    MediaFormat='mp3',
    LanguageCode='en-US',
    OutputBucketName=bucket_name,  # write the result JSON back to our bucket
    Settings={
        'ShowSpeakerLabels': True,   # tag each item with a speaker label
        'MaxSpeakerLabels': 2        # conversation between two speakers
    }
)

# Poll until the job finishes; Transcribe jobs are asynchronous.
while True:
    status = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
    if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
        break
    time.sleep(2)  # avoid hammering the API while waiting
if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':
    # Load the transcript JSON that Transcribe wrote to S3.
    transcript_key = f"{job_name}.json"
    transcript_obj = s3_client.get_object(Bucket=bucket_name, Key=transcript_key)
    transcript_text = transcript_obj['Body'].read().decode('utf-8')
    transcript_json = json.loads(transcript_text)

    # Flatten the word-level items into "spk_N: ..." dialogue lines.
    output_text = ""
    current_speaker = None
    items = transcript_json['results']['items']
    for item in items:
        speaker_label = item.get('speaker_label', None)
        content = item['alternatives'][0]['content']
        # Start a new line whenever the speaker changes.
        if speaker_label is not None and speaker_label != current_speaker:
            current_speaker = speaker_label
            output_text += f"\n{current_speaker}: "
        # Glue punctuation to the previous word (no space before it).
        if item['type'] == 'punctuation':
            output_text = output_text.rstrip()
        output_text += f"{content} "

    # Save the formatted transcript to a local text file.
    with open(f'{job_name}.txt', 'w') as f:
        f.write(output_text)

# Reload the saved transcript as a plain string for prompt construction.
with open(f'{job_name}.txt', "r") as file:
    transcript = file.read()

# Jinja2 prompt template. The {{transcript}} placeholder is filled at render
# time; the <data> tags delimit the conversation for the model, and the
# example shows the exact JSON shape the model must produce.
# NOTE(review): the <data> section and the example-JSON braces were lost in
# the original post and are reconstructed here.
template_string = """ I need to summarize a conversation. The transcript of the
conversation is between the <data> XML like tags.

<data>
{{transcript}}
</data>

The summary must contain a one word sentiment analysis, and
a list of issues, problems or causes of friction
during the conversation. The output must be provided in
JSON format shown in the following example.

Example output:
{
    "sentiment": <sentiment>,
    "issues": [
        {
            "topic": <topic>,
            "summary": <issue_summary>,
        }
    ]
}

Write the JSON output and nothing more.

Here is the JSON output:   """

# Render the prompt by substituting the transcript into the template.
data = {
    'transcript': transcript
}

template = Template(template_string)
prompt = template.render(data)

# Bedrock invocation parameters for the Titan Text Express model.
# NOTE(review): the json.dumps payload braces were truncated in the original
# post and are reconstructed here.
kwargs = {
    "modelId": "amazon.titan-text-express-v1",
    "contentType": "application/json",
    "accept": "*/*",
    "body": json.dumps(
        {
            "inputText": prompt,
            "textGenerationConfig": {
                "maxTokenCount": 512,  # cap on generated tokens
                "temperature": 0,      # deterministic output for summarization
                "topP": 0.9
            }
        }
    )
}

response = bedrock_runtime.invoke_model(**kwargs)

# The response body is a streaming blob of JSON; the generated summary is the
# first result's outputText.
response_body = json.loads(response.get('body').read())
generation = response_body['results'][0]['outputText']


1 comment:

Akhilesh Kumbhar said...

Hello Sir!
I watched your YouTube video on the same and reached your blog.
I want to know how I can perform real-time transcription of the audio input from the mic by using Amazon Transcribe, without having to use an S3 bucket?
Thank you