Friday, February 23, 2024

Build Your Own Audio Transcription Tool with AI

In this video, you will learn how to deploy an LLM-based application into production using Amazon Bedrock and Amazon Transcribe: Transcribe converts the audio files to text with its ASR model, and the Titan text model on Bedrock summarizes the resulting transcripts.
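Before running the script below, the audio file has to exist in your S3 bucket. A minimal upload sketch, assuming the same bucket placeholder and sample file name used in the code that follows (adjust the region to wherever your bucket lives):

import boto3

s3 = boto3.client('s3', region_name='ap-southeast-2')
# Upload the local sample so Transcribe can read it from S3
s3.upload_file('happy.mp3', '<replace your bucket name here>', 'happy.mp3')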



Code:



import boto3
import json
import uuid
import time
from jinja2 import Template

# Bedrock runtime client for invoking the Titan model (available in us-east-1)
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')

# S3 and Transcribe clients run in a separate region here; make sure
# bucket_name below points at a bucket in the region you specify
s3_client = boto3.client(service_name='s3', region_name='ap-southeast-2')

transcribe_client = boto3.client('transcribe', region_name='ap-southeast-2')

bucket_name = '<replace your bucket name here>'

# Two sample recordings; switch the comment to try the other one
#file_name = 'angry.mp3'
file_name = 'happy.mp3'

# A UUID suffix keeps the job name unique; Transcribe rejects duplicate job names
job_name = 'transcription-job-' + str(uuid.uuid4())

# Start an asynchronous transcription job with speaker diarization (up to 2 speakers)
response = transcribe_client.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': f's3://{bucket_name}/{file_name}'},
    MediaFormat='mp3',
    LanguageCode='en-US',
    OutputBucketName=bucket_name,
    Settings={
        'ShowSpeakerLabels': True,
        'MaxSpeakerLabels': 2
    }
)

# Poll until the job either completes or fails
while True:
    status = transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
    if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
        break
    time.sleep(2)
print(status['TranscriptionJob']['TranscriptionJobStatus'])
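
# Optional addition (not in the video): when the job ends FAILED, the
# response carries a FailureReason field worth surfacing before giving up
if status['TranscriptionJob']['TranscriptionJobStatus'] == 'FAILED':
    # e.g., an unsupported media format or a missing S3 object
    print(status['TranscriptionJob'].get('FailureReason', 'No failure reason provided'))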

if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':
   
    # Load the transcript from S3. With OutputBucketName set, Transcribe
    # writes its result to s3://<bucket>/<job_name>.json.
    transcript_key = f"{job_name}.json"
    transcript_obj = s3_client.get_object(Bucket=bucket_name, Key=transcript_key)
    transcript_text = transcript_obj['Body'].read().decode('utf-8')
    transcript_json = json.loads(transcript_text)
   
    output_text = ""
    current_speaker = None
   
    items = transcript_json['results']['items']
   
    for item in items:
       
        speaker_label = item.get('speaker_label', None)
        content = item['alternatives'][0]['content']
       
        # Start the line with the speaker label:
        if speaker_label is not None and speaker_label != current_speaker:
            current_speaker = speaker_label
            output_text += f"\n{current_speaker}: "
           
        # Add the speech content. Punctuation items attach directly to the
        # previous word ("word." rather than "word ."), hence the rstrip:
        if item['type'] == 'punctuation':
            output_text = output_text.rstrip()
           
        output_text += f"{content} "
       
    # Save the transcript to a text file
    with open(f'{job_name}.txt', 'w') as f:
        f.write(output_text)
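    # The saved file interleaves Transcribe's default speaker labels with the
    # speech, one line per speaker turn, e.g. "spk_0: <first speaker's words>"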

# Read the saved transcript back in for the summarization step
with open(f'{job_name}.txt', 'r') as file:
    transcript = file.read()

template_string = """ I need to summarize a conversation. The transcript of the
conversation is between the <data> XML like tags.

<data>
{{transcript}}
</data>

The summary must contain a one word sentiment analysis, and
a list of issues, problems or causes of friction
during the conversation. The output must be provided in
JSON format shown in the following example.

Example output:
{
    "sentiment": <sentiment>,
    "issues": [
        {
            "topic": <topic>,
            "summary": <issue_summary>,
        }
    ]
}

Write the JSON output and nothing more.

Here is the JSON output:   """

data = {
    'transcript' : transcript
}

template = Template(template_string)

prompt = template.render(data)

# Request payload for the Titan Text Express model on Bedrock
kwargs = {
    "modelId": "amazon.titan-text-express-v1",
    "contentType": "application/json",
    "accept": "*/*",
    "body": json.dumps(
        {
            "inputText": prompt,
            "textGenerationConfig": {
                "maxTokenCount": 512,  # cap on the number of generated tokens
                "temperature": 0,      # deterministic output helps keep the JSON stable
                "topP": 0.9
            }
        }
    )
}

response = bedrock_runtime.invoke_model(**kwargs)

# The response body is a stream; read it and pull out the generated text
response_body = json.loads(response.get('body').read())
generation = response_body['results'][0]['outputText']

print(generation)
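
Since the prompt instructs Titan to write the JSON output and nothing more, you can usually parse the generation directly. A minimal sketch, continuing from the script above (models occasionally wrap the JSON in extra prose, so guard the parse):

try:
    summary = json.loads(generation)
    print('Sentiment:', summary['sentiment'])
    for issue in summary.get('issues', []):
        print(f"- {issue['topic']}: {issue['summary']}")
except json.JSONDecodeError:
    # Fall back to the raw text if the model added anything around the JSON
    print('Model did not return valid JSON:\n', generation)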

1 comment:

Akhilesh Kumbhar said...

Hello Sir!
I watched your YouTube video on the same and reached your blog.
I want to know how I can perform real-time transcription of the audio input from the mic by using Amazon Transcribe, without having to use an S3 bucket?
Thank you