Frequentie woordvoorkomen uit ondertitelingsbestand (SRT)
Citaat van JeroenSteen op 18 mei 2024, 21:08

Momenteel ben ik een beetje bezig met Russisch leren aan de hand van onder andere films. Deze films zijn vaak voorzien van dubbing, maar hebben niet altijd een ondertiteling. Gelukkig is het mogelijk om de film met Whisper AI te laten transcriberen. Mijn idee is dan om de ondertiteling ook te gebruiken om de woorden te leren, het liefst voordat ik een film kijk. Aan ChatGPT heb ik gevraagd om een Python-code te maken die de frequentie van woordvoorkomens uitzoekt, en onderstaande, iets aangepaste code werkt hierbij. Deze code deel ik graag; wellicht hebben anderen er ook wat aan.
"""Word-frequency analysis of an SRT subtitle file.

Reads a .srt file, strips cue numbers and timestamp lines, normalises the
remaining text, and prints the 100 most common unigrams, bigrams and
trigrams. (Reconstructed from a forum paste whose line breaks were lost;
the unused ``itertools.islice`` import was dropped.)
"""
import re
from collections import Counter


# Compiled once instead of re-matching the pattern strings per line.
_CUE_NUMBER_RE = re.compile(r'^\d+$')
_TIMESTAMP_RE = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')


def read_srt_file(file_path):
    """Return the full contents of the SRT file at *file_path* (UTF-8)."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def extract_text_from_srt(srt_content):
    """Drop cue numbers, timestamp lines and blanks; join subtitle text.

    A line is discarded when it consists only of digits (cue number),
    matches the "HH:MM:SS,mmm --> HH:MM:SS,mmm" timestamp format, or is
    blank after stripping.
    """
    text_lines = []
    for line in srt_content.split('\n'):
        stripped = line.strip()
        if stripped and not _CUE_NUMBER_RE.match(line) and not _TIMESTAMP_RE.match(line):
            text_lines.append(stripped)
    return ' '.join(text_lines)


def preprocess_text(text):
    """Lowercase *text* and remove punctuation (keeps word chars and whitespace)."""
    return re.sub(r'[^\w\s]', '', text).lower()


def ngrams(text, n):
    """Return the space-joined n-grams of the whitespace-split words of *text*."""
    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]


def get_word_frequencies(text):
    """Return (unigram, bigram, trigram) ``Counter`` objects for *text*."""
    unigram_freq = Counter(text.split())
    bigram_freq = Counter(ngrams(text, 2))
    trigram_freq = Counter(ngrams(text, 3))
    return unigram_freq, bigram_freq, trigram_freq


def main(file_path):
    """Print the 100 most frequent uni-, bi- and trigrams of the subtitle file."""
    srt_content = read_srt_file(file_path)
    text = extract_text_from_srt(srt_content)
    cleaned_text = preprocess_text(text)
    unigram_freq, bigram_freq, trigram_freq = get_word_frequencies(cleaned_text)

    print("Unigram Frequencies:")
    for word, freq in unigram_freq.most_common(100):
        print(f"{word}: {freq}")

    print("\nBigram Frequencies:")
    for word, freq in bigram_freq.most_common(100):
        print(f"{word}: {freq}")

    print("\nTrigram Frequencies:")
    for word, freq in trigram_freq.most_common(100):
        print(f"{word}: {freq}")


# Guarded so the module can be imported (and its helpers reused) without
# immediately trying to read the hard-coded file.
if __name__ == "__main__":
    main('ondertiteling.srt')
Momenteel ben ik een beetje bezig met Russisch leren aan de hand van onder andere films. Deze films zijn vaak voorzien van dubbing, maar hebben niet altijd een ondertiteling. Gelukkig is het mogelijk om de film met Whisper AI te laten transcriberen. Mijn idee is dan om de ondertiteling ook te gebruiken om de woorden te leren, het liefst voordat ik een film kijk. Aan ChatGPT heb ik gevraagd om een Python-code te maken die de frequentie van woordvoorkomens uitzoekt, en onderstaande, iets aangepaste code werkt hierbij. Deze code deel ik graag; wellicht hebben anderen er ook wat aan.
"""Word-frequency analysis of an SRT subtitle file.

Reads a .srt file, discards cue numbers and timestamp lines, normalises
the subtitle text and prints the 100 most common unigrams, bigrams and
trigrams. (Reconstructed from a forum paste whose line breaks were lost;
the unused ``itertools.islice`` import was dropped.)
"""
import re
from collections import Counter


def read_srt_file(file_path):
    """Return the raw contents of *file_path*, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as fh:
        return fh.read()


def extract_text_from_srt(srt_content):
    """Keep only subtitle text lines and join them with single spaces.

    Lines that are a cue number (digits only), a timestamp line
    ("HH:MM:SS,mmm --> HH:MM:SS,mmm"), or blank are dropped.
    """
    cue = re.compile(r'^\d+$')
    stamp = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
    kept = [line.strip()
            for line in srt_content.split('\n')
            if line.strip() and not cue.match(line) and not stamp.match(line)]
    return ' '.join(kept)


def preprocess_text(text):
    """Strip punctuation and fold *text* to lowercase."""
    return re.sub(r'[^\w\s]', '', text).lower()


def ngrams(text, n):
    """List all space-joined runs of *n* consecutive words of *text*."""
    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]


def get_word_frequencies(text):
    """Return frequency ``Counter``s for unigrams, bigrams and trigrams."""
    return (Counter(text.split()),
            Counter(ngrams(text, 2)),
            Counter(ngrams(text, 3)))


def main(file_path):
    """Print the 100 most frequent uni-, bi- and trigrams of *file_path*."""
    cleaned = preprocess_text(extract_text_from_srt(read_srt_file(file_path)))
    unigram_freq, bigram_freq, trigram_freq = get_word_frequencies(cleaned)

    # Labels keep the original leading-newline layout between the sections.
    for heading, counter in (("Unigram Frequencies:", unigram_freq),
                             ("\nBigram Frequencies:", bigram_freq),
                             ("\nTrigram Frequencies:", trigram_freq)):
        print(heading)
        for word, freq in counter.most_common(100):
            print(f"{word}: {freq}")


# Guarded so importing this module does not try to read the hard-coded file.
if __name__ == "__main__":
    main('ondertiteling.srt')
Citaat van JeroenSteen op 20 mei 2024, 19:35

Hierbij een aangepaste versie, waarbij gebruik wordt gemaakt van command-line-argumenten voor: bestandsnaam, verbose, aantal woorden overslaan en aantal te vinden woorden.
"""Word-frequency analysis of an SRT subtitle file (command-line version).

Command-line arguments select the input file, verbosity, how many of the
most common words to skip, and how many to display after skipping.
(Reconstructed from a forum paste whose line breaks were lost.)
"""
import re
import argparse
from collections import Counter
from itertools import islice


# Compiled once instead of re-matching the pattern strings per line.
_CUE_NUMBER_RE = re.compile(r'^\d+$')
_TIMESTAMP_RE = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')


def read_srt_file(file_path):
    """Return the full contents of the SRT file at *file_path* (UTF-8)."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def extract_text_from_srt(srt_content):
    """Drop cue numbers, timestamp lines and blanks; join subtitle text."""
    text_lines = []
    for line in srt_content.split('\n'):
        stripped = line.strip()
        if stripped and not _CUE_NUMBER_RE.match(line) and not _TIMESTAMP_RE.match(line):
            text_lines.append(stripped)
    return ' '.join(text_lines)


def preprocess_text(text):
    """Lowercase *text* and remove punctuation (keeps word chars and whitespace)."""
    return re.sub(r'[^\w\s]', '', text).lower()


def ngrams(text, n):
    """Return the space-joined n-grams of the whitespace-split words of *text*."""
    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]


def get_word_frequencies(text):
    """Return (unigram, bigram, trigram) ``Counter`` objects for *text*."""
    unigram_freq = Counter(text.split())
    bigram_freq = Counter(ngrams(text, 2))
    trigram_freq = Counter(ngrams(text, 3))
    return unigram_freq, bigram_freq, trigram_freq


def print_word(word, freq, verbose=False):
    """Print *word*, including its frequency when *verbose* is true.

    *verbose* is now an explicit parameter: the original read the global
    ``args``, which only exists when the script is run from the command
    line, so the function broke when imported and reused.
    """
    if verbose:
        print(f"{word}: {freq}")
    else:
        print(f"{word}")


def main(args):
    """Analyse ``args.file`` and print the selected n-gram frequency window."""
    srt_content = read_srt_file(args.file)
    text = extract_text_from_srt(srt_content)
    cleaned_text = preprocess_text(text)
    unigram_freq, bigram_freq, trigram_freq = get_word_frequencies(cleaned_text)

    if args.verbose:
        print(f"Processing file: {args.file}")
        print(f"Skipping the first {args.skip} and displaying the next {args.count} frequencies.")

    # islice with start/stop avoids materialising an intermediate sliced
    # list (the original did most_common()[skip:] and then islice'd that).
    for heading, counter in (("Unigram Frequencies:", unigram_freq),
                             ("\nBigram Frequencies:", bigram_freq),
                             ("\nTrigram Frequencies:", trigram_freq)):
        if args.verbose:
            print(heading)
        for word, freq in islice(counter.most_common(), args.skip, args.skip + args.count):
            print_word(word, freq, args.verbose)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Word frequency analyzer for SRT subtitle files.')
    parser.add_argument('file', type=str, help='Path to the SRT file.')
    parser.add_argument('--verbose', action='store_true', help='Increase output verbosity.')
    parser.add_argument('--skip', type=int, default=10, help='Number of most common words to skip.')
    parser.add_argument('--count', type=int, default=100, help='Number of words to display after skipping.')
    main(parser.parse_args())
Hierbij een aangepaste versie, waarbij gebruik wordt gemaakt van command-line argumenten voor: bestandsnaam, verbose, aantal woorden overslaan en aantal te vinden woorden.
"""Word-frequency analysis of an SRT subtitle file (command-line version).

Accepts command-line arguments for the input file, verbosity, the number
of most common words to skip and the number of words to display.
(Reconstructed from a forum paste whose line breaks were lost.)
"""
import re
import argparse
from collections import Counter
from itertools import islice


def read_srt_file(file_path):
    """Return the raw contents of *file_path*, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as fh:
        return fh.read()


def extract_text_from_srt(srt_content):
    """Keep only subtitle text lines and join them with single spaces.

    Lines that are a cue number (digits only), a timestamp line
    ("HH:MM:SS,mmm --> HH:MM:SS,mmm"), or blank are dropped.
    """
    cue = re.compile(r'^\d+$')
    stamp = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
    kept = [line.strip()
            for line in srt_content.split('\n')
            if line.strip() and not cue.match(line) and not stamp.match(line)]
    return ' '.join(kept)


def preprocess_text(text):
    """Strip punctuation and fold *text* to lowercase."""
    return re.sub(r'[^\w\s]', '', text).lower()


def ngrams(text, n):
    """List all space-joined runs of *n* consecutive words of *text*."""
    words = text.split()
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]


def get_word_frequencies(text):
    """Return frequency ``Counter``s for unigrams, bigrams and trigrams."""
    return (Counter(text.split()),
            Counter(ngrams(text, 2)),
            Counter(ngrams(text, 3)))


def print_word(word, freq, verbose=False):
    """Print *word*, with its frequency when *verbose* is true.

    Verbosity is passed in explicitly; the original depended on a global
    ``args`` that only exists when run as a script.
    """
    if verbose:
        print(f"{word}: {freq}")
    else:
        print(f"{word}")


def main(args):
    """Analyse ``args.file`` and print the selected n-gram frequency window."""
    cleaned = preprocess_text(extract_text_from_srt(read_srt_file(args.file)))
    unigram_freq, bigram_freq, trigram_freq = get_word_frequencies(cleaned)

    if args.verbose:
        print(f"Processing file: {args.file}")
        print(f"Skipping the first {args.skip} and displaying the next {args.count} frequencies.")

    # Slicing inside islice avoids the intermediate most_common()[skip:] list.
    for heading, counter in (("Unigram Frequencies:", unigram_freq),
                             ("\nBigram Frequencies:", bigram_freq),
                             ("\nTrigram Frequencies:", trigram_freq)):
        if args.verbose:
            print(heading)
        for word, freq in islice(counter.most_common(), args.skip, args.skip + args.count):
            print_word(word, freq, args.verbose)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Word frequency analyzer for SRT subtitle files.')
    parser.add_argument('file', type=str, help='Path to the SRT file.')
    parser.add_argument('--verbose', action='store_true', help='Increase output verbosity.')
    parser.add_argument('--skip', type=int, default=10, help='Number of most common words to skip.')
    parser.add_argument('--count', type=int, default=100, help='Number of words to display after skipping.')
    main(parser.parse_args())