YouTube video to transcript using OpenAI Whisper and summary using Ollama

Author

Nipun Batra

Published

December 18, 2023

try:
    from pydub import AudioSegment
except ImportError:
    %pip install pydub
    %pip install pydub[extras]
    from pydub import AudioSegment
    from pydub.playback import play

from IPython.display import Audio
# Path of the short recorded question used in the first half of the notebook.
audio_path = '../datasets/audio/Prime-minister.m4a'
# Decode the m4a file into a pydub AudioSegment.
audio = AudioSegment.from_file(audio_path, format="m4a")
# Bare expression: intended to show the loaded segment as the cell's result.
audio

try:    
    import whisper
except ImportError:
    %pip install openai-whisper
    import whisper

# Load the Whisper checkpoint named "base.en"; the model object is reused
# for the lecture transcription further down.
whisper_model = whisper.load_model("base.en")

# Transcribe straight from the file path. The result is a dict with
# 'text', 'segments' and 'language' keys (see the printed output below).
# NOTE(review): fp16=True presumably only takes effect on a GPU - confirm.
transcription = whisper_model.transcribe(audio_path, fp16=True, verbose=False)

100%|██████████| 347/347 [00:00<00:00, 367.83frames/s]

transcription

{'text': ' Who is the Prime Minister of India?',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 3.0,
   'text': ' Who is the Prime Minister of India?',
   'tokens': [50363, 5338, 318, 262, 5537, 4139, 286, 3794, 30, 50513],
   'temperature': 0.0,
   'avg_logprob': -0.34697675704956055,
   'compression_ratio': 0.813953488372093,
   'no_speech_prob': 0.005249415524303913}],
 'language': 'en'}

from IPython.display import Audio

try:
    from gtts import gTTS
except ImportError:
    %pip install gtts
    from gtts import gTTS

def speak(text, file):
    """Synthesize *text* as English speech, store it in *file*, return an Audio.

    Args:
        text: Text handed to gTTS for synthesis.
        file: Path the synthesized audio is written to; opened in binary
            write mode, so an existing file is overwritten.

    Returns:
        An ``Audio`` display object constructed from *file*.
    """
    synthesizer = gTTS(text, lang='en')
    with open(file, 'wb') as audio_out:
        synthesizer.write_to_fp(audio_out)
    clip = Audio(file)
    return clip

speak(transcription['text'], '../datasets/audio/pm-2.mp3')

from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# LLM handle for the "llama2" model served through Ollama. The callback manager
# carries a stdout streaming handler, so responses are echoed while generated.
llm = Ollama(model="llama2", 
             callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]))

def answers(llm, prompt_qs, prompts, text):
    """Run every prompt through *llm* and collect one response per prompt.

    Args:
        llm: Callable invoked as ``llm(prompt, temperature=0.5)``.
        prompt_qs: Short instruction labels, printed before each response and
            paired positionally with *prompts*.
        prompts: Full prompt strings sent to *llm*.
        text: Unused; kept so existing call sites keep working.

    Returns:
        A list holding the response for each prompt, in order. Pairs are
        formed with ``zip``, so surplus items in the longer sequence are
        ignored. Empty input yields an empty list.
    """
    outputs = []
    # A distinct loop name keeps the ``prompt_qs`` argument from being rebound.
    for prompt, question in zip(prompts, prompt_qs):
        print(question, end="\n")
        output = llm(prompt, temperature=0.5)
        print("\n" + "==" * 50, end="\n\n")
        # Append inside the loop so every response is kept, not only the last
        # one, and so an empty prompt list no longer raises NameError.
        outputs.append(output)
    return outputs

# A single instruction; the transcribed question is appended after a colon.
prompt_qs = ["Please be concise."]
prompts = [f'{question}:{transcription["text"]}' for question in prompt_qs]

outputs = answers(llm, prompt_qs, prompts, transcription["text"])

Please be concise.

The Prime Minister of India is Narendra Modi.
====================================================================================================

speak(outputs[0].replace("\n", ""), '../datasets/audio/pm-answer.mp3')

References

from IPython.display import YouTubeVideo

# YouTubeVideo takes the bare video id and builds the embed URL itself;
# passing the full watch URL produces a malformed embed address.
YouTubeVideo('CuBzyh4Xmvk', width=500, height=300)

try:
    import yt_dlp
except ImportError:
    %pip install yt_dlp
    import yt_dlp

def download(video_id: str, save_path: str, filename: str = "lecture.m4a") -> str:
    """Download the audio of a YouTube video into *save_path* via yt-dlp.

    Args:
        video_id: YouTube video id (the ``v=`` value of the watch URL).
        save_path: Directory handed to yt-dlp as its ``home`` path.
        filename: Output file name inside *save_path*. Defaults to
            ``"lecture.m4a"``, the name that used to be hard-coded here.
            The audio post-processor targets m4a, so an ``.m4a`` name is
            expected.

    Returns:
        *save_path* unchanged (the directory, not the downloaded file path).

    Raises:
        RuntimeError: If yt-dlp reports a non-zero error code.
    """
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    ydl_opts = {
        # Format selector with fallbacks: m4a first, then best audio, then best.
        'format': 'm4a/bestaudio/best',
        'paths': {'home': save_path},
        'outtmpl': {'default': filename},
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([video_url])

    if error_code != 0:
        # RuntimeError is still caught by callers using ``except Exception``.
        raise RuntimeError('Failed to download video')

    return save_path

download('CuBzyh4Xmvk', '../datasets/audio/')

[youtube] Extracting URL: https://www.youtube.com/watch?v=CuBzyh4Xmvk
[youtube] CuBzyh4Xmvk: Downloading webpage
[youtube] CuBzyh4Xmvk: Downloading ios player API JSON
[youtube] CuBzyh4Xmvk: Downloading android player API JSON
[youtube] CuBzyh4Xmvk: Downloading m3u8 information
[info] CuBzyh4Xmvk: Downloading 1 format(s): 140
[download] ../datasets/audio/lecture.m4a has already been downloaded
[download] 100% of   72.26MiB
[ExtractAudio] Not converting audio ../datasets/audio/lecture.m4a; file is already in target format m4a

'../datasets/audio/'

# From here on the notebook works with the downloaded lecture audio.
audio_path = '../datasets/audio/lecture.m4a'
audio = AudioSegment.from_file(audio_path, format="m4a")

# Preview slice of the lecture.
# NOTE(review): pydub slices are presumably in milliseconds (about 13 s) - confirm.
audio[:13000]

# Same file as above, so reuse the path variable instead of repeating the literal.
transcription = whisper_model.transcribe(audio_path, fp16=True, verbose=False)

 99%|█████████▉| 465481/468481 [02:07<00:00, 3643.86frames/s]

print(transcription["text"][:500].replace(". ", "\n"))

 Please look at the code mentioned above and please sign up on the Google Cloud
We've already started making some announcements
You will likely end up missing the announcements and you'll have no one else to play with
The second quick logistical announcement is that we'll have an extra lecture on Saturday, 11th Jan at 11am in 1.101
So a lot of ones over there
And I think one or two people still have conflict, but in the larger, in the larger phone we'll have almost everyone available, so we

transcription.keys()

dict_keys(['text', 'segments', 'language'])

def create_srt_from_transcription(transcription_objects, srt_file_path):
    """Write the segments of a transcription to *srt_file_path* in SRT layout.

    Each segment becomes one cue: a running number, a
    ``start --> end`` line built with ``format_time``, the segment text,
    and a blank separator line.

    Args:
        transcription_objects: Mapping with a ``'segments'`` list; every
            segment provides ``'start'``, ``'end'`` and ``'text'``.
        srt_file_path: Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so non-ASCII transcript text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(srt_file_path, 'w', encoding='utf-8') as srt_file:
        # SRT cue numbers start at 1.
        for index, entry in enumerate(transcription_objects['segments'], start=1):
            start_time_str = format_time(entry['start'])
            end_time_str = format_time(entry['end'])
            srt_file.write(
                f"{index}\n"
                f"{start_time_str} --> {end_time_str}\n"
                f"{entry['text']}\n\n"
            )

def format_time(time_seconds):
    """Format an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        time_seconds: Non-negative offset in seconds (int or float).

    Returns:
        The timestamp string, with the fractional part kept as milliseconds.
    """
    # Work in whole milliseconds: the fraction used to be hard-coded as ",000",
    # and rounding here lets a value like 59.9996 carry into the next minute.
    total_ms = int(round(time_seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

create_srt_from_transcription(transcription, "../datasets/audio/lecture.srt")

!head ../datasets/audio/lecture.srt

1
00:00:00,000 --> 00:00:05,000
 Please look at the code mentioned above and please sign up on the Google Cloud.

2
00:00:05,000 --> 00:00:08,000
 We've already started making some announcements.

3
00:00:08,000 --> 00:00:14,000

speak(transcription['text'][:1300], '../datasets/audio/hello.mp3')

try:
    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
except:
    %pip install transformers -U -q
    %pip install sentencepiece
    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast


# Load the pretrained mBART-50 one-to-many translation model by its hub name.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")

# Load the matching tokenizer, with English ("en_XX") as the source language.
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")

# Take the first 500 characters of the transcript and split them on ". ".
# The pieces lose their trailing period, and the last one may be cut
# mid-sentence by the 500-character limit.
text_to_translate = transcription["text"][:500].split(". ")
text_to_translate

[' Please look at the code mentioned above and please sign up on the Google Cloud',
 "We've already started making some announcements",
 "You will likely end up missing the announcements and you'll have no one else to play with",
 "The second quick logistical announcement is that we'll have an extra lecture on Saturday, 11th Jan at 11am in 1.101",
 'So a lot of ones over there',
 "And I think one or two people still have conflict, but in the larger, in the larger phone we'll have almost everyone available, so we"]

# Tokenize all sentences as one padded batch, requested as "pt" tensors.
model_inputs = tokenizer(text_to_translate, return_tensors="pt", padding=True, truncation=True)

# Pin the first generated token to the "hi_IN" language code so the
# one-to-many model produces Hindi (see the translation printed below).
generated_tokens = model.generate(
    **model_inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"]
)

# Decode the generated ids back to strings, dropping special tokens.
translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

translation

['कृपया उपर्युक्त कोड को देखें और कृपया Google क्लाउड पर साइन अप करें',
 'हम पहले से ही कुछ घोषणाएं करने शुरू कर दी हैं',
 'आप शायद अंत में घोषणाओं को खो देंगे और आप के साथ खेलने के लिए कोई अन्य नहीं होगा',
 'दूसरा त्वरित लॉजिस्टिक घोषणा यह है कि हम एक अतिरिक्त व्याख्यान Saturday, 11th Jan 11am में 1.101 में होगा',
 'तो वहाँ के बहुत से',
 'और मुझे लगता है कि एक या दो लोग अभी भी संघर्ष है, लेकिन बड़ी, बड़ी फोन में हम लगभग सभी उपलब्ध हो जाएगा, तो हम']

# Rebind ``llm`` to the "mistral" model for the lecture transcript prompts.
llm = Ollama(
    model="mistral",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

# Instructions applied to the full lecture transcript, one LLM call per entry.
prompt_qs = [
    "Please provide a bullet-point summary for the given text:",
    "Highlight the important topics and subtopics in the given lecture:",
    "Give us some question for a quiz based on the following text:",
    "Summarize the following text in Hindi in 10 lines or less:",
]

# Each prompt is the instruction, a blank line, then the whole transcript.
prompts = [f'{question}\n\n{transcription["text"]}' for question in prompt_qs]

outputs = answers(llm, prompt_qs, prompts, transcription["text"])

Please provide a bullet-point summary for the given text:
 * The text discusses a machine learning course and announces several logistical matters, including signing up for Google Cloud, an extra lecture on Saturday, and providing access to Google Docs for FAQ and project questions.
* The definition of machine learning is discussed, with the ability to learn without explicit programming being highlighted.
* A task to recognize digits from a dataset is introduced as an example, and rules are suggested for recognizing the digit "4".
* It is explained that traditional programming involves explicitly programming rules, while machine learning involves using data and experience to learn patterns and make predictions.
* An example of predicting tomato quality based on visual features is given, with the goal being to scale up this process in a business setting.
* The concept of precision and recall in machine learning evaluation metrics is touched upon, as well as the idea of a decision tree algorithm for classification tasks.
* The text encourages students to come up with simple rules for recognizing patterns and using decision trees to make predictions based on those rules.
* The greedy algorithm for finding the best attribute for splitting data in a decision tree is mentioned, along with the concept of entropy as a measure of disorder or uncertainty in a dataset.
====================================================================================================

Highlight the important topics and subtopics in the given lecture:
 The given lecture covers several important topics related to machine learning, including:

1. Machine Learning Definition and Concepts
* Explicit programming vs. machine learning
* Linear programming vs. machine learning
* Learning into a computer program
2. Recognizing Digits using Machine Learning
* Writing rules to recognize digits from dataset
3. Machine Learning Algorithms and Techniques
* Decision Trees for Classification Problems
4. Performance Measures in Machine Learning
* Accuracy, Precision, Recall, F-score, and Matthew's Correlation Coefficient
5. Optimal Decision Tree and Greedy Algorithm
6. Data Preprocessing and Feature Selection
7. Entropy, Information Gain, and Attribute Selection
8. Decision Tree Implementation and Details
9. Limitations and Future Work in Machine Learning

The lecture also includes discussions on the importance of data preprocessing, feature selection, and understanding performance measures for evaluating machine learning models effectively. It is important to note that this list might not be exhaustive, but it covers the main topics mentioned in the given lecture.
====================================================================================================

Give us some question for a quiz based on the following text:
 1. What is machine learning and when was it first introduced?
2. What is the difference between explicit programming and machine learning?
3. In the context of machine learning, what is a training set and a test set?
4. What are some rules for recognizing the digit "4" in an image dataset?
5. What is precision and recall in machine learning?
6. What is the difference between precision and Matthew's correlation coefficient?
7. In the given example, what is the precision, recall, F score, and Matthew's correlation coefficient for predicting cancerous or not based on a dataset with 91 entries, of which 90 are not cancerous and 1 is cancerous?
8. What is the main difference between decision trees and other machine learning algorithms?
9. How does a decision tree algorithm work to classify data based on attributes?
10. What is entropy in information theory and how is it related to decision trees?
====================================================================================================

Summarize the following text in Hindi in 10 lines or less:
 हेज़रूदीन भाषा में 10 शोधनावलिका:

1. यहाँ देखें लोगों को Google Cloud पर स्IGN UP करें। अगर आप मिटावे पहुँचती हैं, तो बेहतरीन शायद अपने साथ लगे जाएँगे। पहले तक हमें कुछ अख्बरें दिये गये हैं, वहाँ आप नहीं पहूँचेंगे और कोई भी साथी नहीं होगा।
2. सब्बत, 11-01-2023 रात्रिकाल 11:00 वज़न में एक और पदार्थ होगा। यहाँ कुछ लोगों की संख्या काफी बढ़ जाती है, और अधिक लोगों में सभी पहुँचेंगे।
3. FAQ और प्रोजेक्ट्स जिसे Google Docs में पहले शार्तीय थे, आप सभी टिप्साहित रखा जाएंगे। अगर आपको कुछ सवाल है, तो इसमें टिप्साहित शून्य भागान बनाएं और हमें दिजिएं।
4. अगर आपको कुछ प्रश्न है, तो Google Docs पर प्रोजेक्ट्स के लिए टिप्साहित दूं।
5. पहले ध्यान रखें कि वही चीज़ पर और जो वही समझाई गयी थी, वही समझाएं।
6. Arthur Sandler द्वारा 1959 से पहले के साथ उन्होंने "मशीन लर्निंग" (Machine Learning) का शब्द पहली बातचीत की।
7. एक सामग्री के लिए वेज़न से अपनी शिकाई करने के लिए इंजामत है।
8. प्रोग्राम स्वयं प्रोग्राम है, नहीं दोनों तक प्रोग्राम होते हैं।
9. आज के लिए काम करने वाले प्रश्रेण की बताव क्या है? यह सामग्री के वेज़न से अपनी शिकाई करने के लिए इंजामत है।
10. दिए गये डिगिट्स (0-9) के लिए, पहली अध्ययन के लिए एक प्रोग्राम बनाएं जिससे वे दिगिट्स recognize करें।
====================================================================================================