73 lines
1.9 KiB
Python
73 lines
1.9 KiB
Python
import re
|
|
from datetime import timedelta
|
|
|
|
# Matches a cue timing line such as "00:01.000 --> 00:04.000"; the hours field
# is optional. Group 1 is the start timestamp, group 2 the end timestamp.
VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"

# Matches a line made only of digits (a numeric cue identifier).
VTT_LINE_NUMBER_PATTERN = r"^\d+$"


def from_vtt(vtt_string):
    """Parse WebVTT text into a list of cue dictionaries.

    The text is split into blocks on blank lines. A leading block that starts
    with ``WEBVTT`` is dropped, a digits-only first line of a block is treated
    as a cue identifier and skipped, and any block whose (remaining) first
    line is not a timing line is ignored.

    Args:
        vtt_string: Full text of a WebVTT document. ``\\r\\n`` and ``\\r``
            line endings are accepted as well as ``\\n``.

    Returns:
        A list of dicts with the keys ``'start'`` and ``'end'`` (timestamp
        strings exactly as written in the input) and ``'content'`` (the cue
        text lines joined with ``\\n``).
    """
    # Normalise line endings first; otherwise a CRLF file never contains
    # "\n\n" and the whole document collapses into one block.
    normalized = vtt_string.replace('\r\n', '\n').replace('\r', '\n')

    parts = re.split(r'\n\n+', normalized.strip())
    if parts[0].startswith('WEBVTT'):
        parts.pop(0)

    subtitles = []
    for part in parts:
        lines = part.split('\n')

        # Skip a numeric cue identifier, but only when something follows it:
        # a block holding nothing but a number has no timing line to read.
        if len(lines) > 1 and re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
            lines = lines[1:]

        # The pattern is not anchored at the end, so anything after the end
        # timestamp on the timing line is ignored.
        match = re.match(VTT_TIMECODE_PATTERN, lines[0])
        if not match:
            continue

        start, end = match.groups()
        subtitles.append({
            'start': start,
            'end': end,
            'content': '\n'.join(lines[1:]),
        })

    return subtitles
|
|
|
|
def to_vtt(subtitles):
    """Serialize cue dictionaries into WebVTT text.

    Args:
        subtitles: Iterable of dicts providing the keys ``'start'``,
            ``'end'`` and ``'content'``.

    Returns:
        A string beginning with the ``WEBVTT`` header, followed by one
        ``start --> end`` line plus content per cue. Leading and trailing
        whitespace of the whole document is stripped, so an empty input
        yields just ``"WEBVTT"``.
    """
    cues = [
        f"{subtitle['start']} --> {subtitle['end']}\n{subtitle['content']}\n\n"
        for subtitle in subtitles
    ]
    # Build the document in one join instead of repeated string concatenation.
    return ("WEBVTT\n\n" + "".join(cues)).strip()
|
|
|
|
def stack_subtitle(parsed_vtt=None, *, ends_section=None):
    """Stack cue texts so each cue shows the lines accumulated in its section.

    Every input cue produces one output cue with the same keys; its
    ``'content'`` becomes the content of all cues seen so far in the current
    section, each followed by ``"\\n"``. The input dictionaries are copied,
    not modified.

    NOTE(review): the earlier version took no arguments, read a module-level
    ``parsed_vtt``, printed each cue, never flushed its section buffer and
    returned nothing, although its caller passed it a cue list and forwarded
    the result. The accumulate-per-cue behaviour is a reconstruction of that
    unfinished code; confirm it is the intended output.

    Args:
        parsed_vtt: Iterable of cue dicts with at least a ``'content'`` key.
            ``None`` (the default, kept so a zero-argument call stays valid)
            is treated as no cues.
        ends_section: Optional callable receiving a cue's stripped content
            and returning true when that cue closes the current section, so
            accumulation restarts with the next cue. With the default
            ``None`` all cues belong to a single section.

    Returns:
        A new list of cue dicts with stacked ``'content'``.
    """
    if parsed_vtt is None:
        return []

    stacked = []
    running_text = ""
    for cue in parsed_vtt:
        running_text += cue["content"] + "\n"
        # Copy the cue so the caller's parsed data keeps its original text.
        stacked.append({**cue, "content": running_text})

        # The closing cue is part of the section it ends; reset afterwards.
        if ends_section is not None and ends_section(cue["content"].strip()):
            running_text = ""

    return stacked
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read example.vtt from the working directory, parse
    # it, stack the cues and print the resulting WebVTT text. Guarded so that
    # importing this module does not touch the filesystem or print anything.
    # The encoding is given explicitly so decoding does not depend on the
    # platform default.
    with open("example.vtt", "r", encoding="utf-8") as f:
        vtt_content = f.read()

    parsed_vtt = from_vtt(vtt_content)
    print(to_vtt(stack_subtitle(parsed_vtt)))