Forum

of Registreren om berichten en onderwerpen te maken.

Frequentie woordvoorkomen uit ondertitelingsbestand (SRT)

Momenteel ben ik bezig met Russisch leren, onder andere aan de hand van films. Deze films zijn vaak nagesynchroniseerd, maar hebben niet altijd ondertiteling. Gelukkig is het mogelijk om een film met Whisper AI te laten transcriberen. Mijn idee is om de ondertiteling ook te gebruiken om de woorden te leren, liefst voordat ik de film kijk. Ik heb ChatGPT gevraagd om Python-code te maken die de frequentie van woordvoorkomens bepaalt; onderstaande, iets aangepaste code werkt hiervoor. Deze code deel ik graag, wellicht hebben anderen er ook wat aan.

import re
from collections import Counter
from itertools import islice

def read_srt_file(file_path):
    """Return the complete text of the subtitle file at *file_path*.

    The file is decoded with the ``utf-8-sig`` codec: it reads plain UTF-8
    exactly like ``utf-8`` does, but additionally drops a leading byte-order
    mark. Without that, the mark stays glued to the first cue number, the
    line no longer looks like a bare number and ends up counted as a word.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        The decoded file content as a single string.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read()

def extract_text_from_srt(srt_content):
    """Return only the spoken text of an SRT document as one string.

    Cue numbers, timing lines and blank lines are dropped; the remaining
    lines are stripped and joined with single spaces.

    Args:
        srt_content: Full content of an SRT file.

    Returns:
        The subtitle text, with lines separated by single spaces.
    """
    cue_number = re.compile(r'\d+')
    # Prefix match: some files append position coordinates after the times.
    timing = re.compile(
        r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}'
    )
    # Formatting markup (<i>, <b>, <u>, <font ...>, {\an8}) must go before
    # punctuation is removed later on, otherwise "<i>word</i>" turns into
    # "iwordi".
    markup = re.compile(r'</?(?:i|b|u|font)\b[^>]*>|\{\\an\d\}', re.IGNORECASE)

    text_lines = []
    # A byte-order mark at the very start would hide the first cue number.
    for raw_line in srt_content.lstrip('\ufeff').splitlines():
        # Strip first so trailing spaces or carriage returns cannot make a
        # cue number or timing line pass as text.
        line = raw_line.strip()
        if not line or cue_number.fullmatch(line) or timing.match(line):
            continue
        line = markup.sub('', line).strip()
        if line:
            text_lines.append(line)
    return ' '.join(text_lines)

def preprocess_text(text):
    """Normalise *text* for counting: drop punctuation, then lowercase.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "it's" becomes "its".
    """
    punctuation = re.compile(r'[^\w\s]')
    without_punctuation = punctuation.sub('', text)
    return without_punctuation.lower()

def ngrams(text, n):
    """Return all runs of *n* consecutive words in *text*, in order.

    Words are whatever ``str.split()`` yields; each n-gram is returned as a
    single space-joined string. A text with fewer than *n* words gives an
    empty list.

    Args:
        text: Whitespace-separated text.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A list of n-gram strings.

    Raises:
        ValueError: If *n* is smaller than 1. Such a window has no meaning
            and would otherwise produce a list of empty strings.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    words = text.split()
    # Zipping the word list against itself shifted by 0..n-1 positions walks
    # a window of n words; zip stops as soon as the shortest slice runs out.
    shifted = (words[offset:] for offset in range(n))
    return [' '.join(window) for window in zip(*shifted)]

def get_word_frequencies(text):
    """Count single words, word pairs and word triples in *text*.

    Returns:
        A 3-tuple of ``Counter`` objects: (unigrams, bigrams, trigrams).
    """
    single_words = Counter(text.split())
    word_pairs = Counter(ngrams(text, 2))
    word_triples = Counter(ngrams(text, 3))
    return single_words, word_pairs, word_triples

def main(file_path):
    """Print the 100 most common uni-, bi- and trigrams of an SRT file.

    Reads the file, strips the SRT structure, normalises the text and then
    prints one section per n-gram size, each entry as "ngram: count".
    """
    raw_srt = read_srt_file(file_path)
    cleaned_text = preprocess_text(extract_text_from_srt(raw_srt))
    unigram_freq, bigram_freq, trigram_freq = get_word_frequencies(cleaned_text)

    # From the second section on, the heading carries its own leading blank
    # line.
    sections = (
        ("Unigram Frequencies:", unigram_freq),
        ("\nBigram Frequencies:", bigram_freq),
        ("\nTrigram Frequencies:", trigram_freq),
    )
    for heading, counter in sections:
        print(heading)
        for word, freq in counter.most_common(100):
            print(f"{word}: {freq}")

# Subtitle file analysed when this script is run directly.
file_path = 'ondertiteling.srt'

# Guarded so that importing this module (e.g. to reuse the helper functions)
# does not immediately read a file and print to stdout.
if __name__ == "__main__":
    main(file_path)

Hierbij een aangepaste versie, waarbij gebruik wordt gemaakt van command-line-argumenten voor: de bestandsnaam, uitgebreide uitvoer (verbose), het aantal meest voorkomende woorden dat wordt overgeslagen en het aantal woorden dat wordt getoond.

import re
import argparse
from collections import Counter
from itertools import islice

def read_srt_file(file_path):
    """Read the subtitle file at *file_path* and return its text.

    Decoding uses ``utf-8-sig`` rather than ``utf-8``: both read ordinary
    UTF-8, but ``utf-8-sig`` also discards a byte-order mark at the start of
    the file. A surviving mark would stick to the first cue number, which
    then no longer looks like a bare number and gets counted as a word.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        The decoded file content as a single string.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read()

def extract_text_from_srt(srt_content):
    """Return only the spoken text of an SRT document as one string.

    Cue numbers, timing lines and blank lines are dropped; the remaining
    lines are stripped and joined with single spaces.

    Args:
        srt_content: Full content of an SRT file.

    Returns:
        The subtitle text, with lines separated by single spaces.
    """
    cue_number = re.compile(r'\d+')
    # Prefix match: some files append position coordinates after the times.
    timing = re.compile(
        r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}'
    )
    # Formatting markup (<i>, <b>, <u>, <font ...>, {\an8}) must go before
    # punctuation is removed later on, otherwise "<i>word</i>" turns into
    # "iwordi".
    markup = re.compile(r'</?(?:i|b|u|font)\b[^>]*>|\{\\an\d\}', re.IGNORECASE)

    text_lines = []
    # A byte-order mark at the very start would hide the first cue number.
    for raw_line in srt_content.lstrip('\ufeff').splitlines():
        # Strip first so trailing spaces or carriage returns cannot make a
        # cue number or timing line pass as text.
        line = raw_line.strip()
        if not line or cue_number.fullmatch(line) or timing.match(line):
            continue
        line = markup.sub('', line).strip()
        if line:
            text_lines.append(line)
    return ' '.join(text_lines)

def preprocess_text(text):
    """Delete punctuation from *text* and return the result in lowercase.

    Anything that is not a word character or whitespace is removed outright
    (no space is inserted), so "кто-то" becomes "ктото".
    """
    return re.sub(r'[^\w\s]', '', text).lower()

def ngrams(text, n):
    """Return all runs of *n* consecutive words in *text*, in order.

    Words are whatever ``str.split()`` yields; each n-gram is returned as a
    single space-joined string. A text with fewer than *n* words gives an
    empty list.

    Args:
        text: Whitespace-separated text.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A list of n-gram strings.

    Raises:
        ValueError: If *n* is smaller than 1. Such a window has no meaning
            and would otherwise produce a list of empty strings.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    words = text.split()
    # Zipping the word list against itself shifted by 0..n-1 positions walks
    # a window of n words; zip stops as soon as the shortest slice runs out.
    shifted = (words[offset:] for offset in range(n))
    return [' '.join(window) for window in zip(*shifted)]

def get_word_frequencies(text):
    """Build frequency tables for 1-, 2- and 3-word sequences in *text*.

    Returns:
        A 3-tuple of ``Counter`` objects in the order
        (unigrams, bigrams, trigrams).
    """
    unigram_counts = Counter(text.split())
    bigram_counts, trigram_counts = (
        Counter(ngrams(text, size)) for size in (2, 3)
    )
    return unigram_counts, bigram_counts, trigram_counts

def print_word(word, freq, verbose=None):
    """Print one frequency entry.

    In verbose mode the line is "word: freq"; otherwise only the word is
    printed.

    Args:
        word: The word or n-gram to print.
        freq: How often it occurred.
        verbose: Whether to include the count. When left as ``None`` the
            setting is taken from the module-level ``args`` namespace that
            the command-line entry point creates. If no such namespace
            exists (for example when this function is imported and called
            from other code), the plain form is used instead of failing
            with a NameError.
    """
    if verbose is None:
        cli_args = globals().get('args')
        verbose = bool(getattr(cli_args, 'verbose', False))
    if verbose:
        print(f"{word}: {freq}")
    else:
        print(f"{word}")

def main(args):
    """Analyse the SRT file named in *args* and print its n-gram frequencies.

    For each of unigrams, bigrams and trigrams, the ``args.skip`` most
    common entries are skipped and the next ``args.count`` entries are
    printed via ``print_word``. Headings and the processing summary are
    only printed when ``args.verbose`` is set.

    Args:
        args: Namespace providing ``file``, ``verbose``, ``skip`` and
            ``count``.
    """
    raw_srt = read_srt_file(args.file)
    cleaned_text = preprocess_text(extract_text_from_srt(raw_srt))
    unigram_freq, bigram_freq, trigram_freq = get_word_frequencies(cleaned_text)

    if args.verbose:
        print(f"Processing file: {args.file}")
        print(f"Skipping the first {args.skip} and displaying the next {args.count} frequencies.")

    # From the second section on, the heading carries its own leading blank
    # line.
    sections = (
        ("Unigram Frequencies:", unigram_freq),
        ("\nBigram Frequencies:", bigram_freq),
        ("\nTrigram Frequencies:", trigram_freq),
    )
    for heading, counter in sections:
        if args.verbose:
            print(heading)
        ranked = counter.most_common()
        for word, freq in islice(ranked[args.skip:], args.count):
            print_word(word, freq)

# Command-line entry point: parse the arguments, validate them and run the
# analysis.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Word frequency analyzer for SRT subtitle files.')
    parser.add_argument('file', type=str, help='Path to the SRT file.')
    parser.add_argument('--verbose', action='store_true', help='Increase output verbosity.')
    parser.add_argument('--skip', type=int, default=10, help='Number of most common words to skip.')
    parser.add_argument('--count', type=int, default=100, help='Number of words to display after skipping.')

    # `args` must stay a module-level name: print_word() reads it as a global.
    args = parser.parse_args()

    # A negative --skip would silently slice from the end of the ranking and
    # a negative --count makes islice() raise; report both as usage errors.
    if args.skip < 0:
        parser.error('--skip must be zero or greater.')
    if args.count < 0:
        parser.error('--count must be zero or greater.')

    main(args)