Fix
This commit is contained in:
parent
f5f9238161
commit
bfc4d4b089
|
@ -4,7 +4,7 @@ from datetime import timedelta
|
||||||
VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
|
VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
|
||||||
VTT_LINE_NUMBER_PATTERN = r"^\d+$"
|
VTT_LINE_NUMBER_PATTERN = r"^\d+$"
|
||||||
|
|
||||||
def parse_vtt(vtt_string):
|
def from_vtt(vtt_string):
|
||||||
parts = re.split(r'\n\n+', vtt_string.strip())
|
parts = re.split(r'\n\n+', vtt_string.strip())
|
||||||
|
|
||||||
if parts[0].startswith('WEBVTT'):
|
if parts[0].startswith('WEBVTT'):
|
||||||
|
@ -35,47 +35,39 @@ def parse_vtt(vtt_string):
|
||||||
def to_vtt(subtitles):
|
def to_vtt(subtitles):
|
||||||
vtt_content = "WEBVTT\n\n"
|
vtt_content = "WEBVTT\n\n"
|
||||||
for idx, subtitle in enumerate(subtitles):
|
for idx, subtitle in enumerate(subtitles):
|
||||||
# print(subtitle, idx)
|
|
||||||
start = subtitle['start']
|
start = subtitle['start']
|
||||||
end = subtitle['end']
|
end = subtitle['end']
|
||||||
content = subtitle['content']
|
content = subtitle['content']
|
||||||
vtt_content += f"{start} --> {end}\n{content}\n\n"
|
vtt_content += f"{start} --> {end}\n{content}\n\n"
|
||||||
return vtt_content.strip()
|
return vtt_content.strip()
|
||||||
|
|
||||||
|
def stack_subtitle(parsed_vtt):
    """Turn a cue list into "stacked" cues whose text accumulates over time.

    Each returned cue keeps the timing (and any other keys) of the matching
    input cue, but its ``content`` is the text of every cue up to and
    including that one, each followed by a newline.

    Args:
        parsed_vtt: list of cue dicts with at least a ``content`` key, in
            the shape produced by ``from_vtt``.

    Returns:
        A new list of cue dicts; the input dicts are not modified.

    NOTE(review): the previous revision grouped cues into sections (by
    sentence count) and restarted the stack per section. This function had
    that condition stubbed out with ``if True``, so a single running section
    is kept here; reintroduce a section boundary if that is still wanted.
    """
    stacked = []
    running = ""
    for cue in parsed_vtt:
        running += cue["content"] + "\n"
        # Copy instead of mutating: the caller may still need the
        # original per-cue text.
        stacked.append({**cue, "content": running})
    return stacked
|
||||||
|
|
||||||
with open("example.vtt", "r") as f:
|
with open("example.vtt", "r") as f:
|
||||||
vtt_content = f.read()
|
vtt_content = f.read()
|
||||||
|
|
||||||
parsed_vtt = parse_vtt(vtt_content)
|
parsed_vtt = from_vtt(vtt_content)
|
||||||
#print(len(parsed_vtt))
|
print(to_vtt(stack_subtitle(parsed_vtt)))
|
||||||
|
|
||||||
buffer = []
|
|
||||||
linebuf = []
|
|
||||||
|
|
||||||
for line in parsed_vtt:
|
|
||||||
# print(line["content"].strip())
|
|
||||||
content = line["content"].strip()
|
|
||||||
if "".join([i["content"] for i in linebuf]).count(".") < 4 or len(linebuf) < 5:
|
|
||||||
linebuf.append(line)
|
|
||||||
else:
|
|
||||||
linebuf.append(line)
|
|
||||||
buffer.append(linebuf)
|
|
||||||
linebuf = []
|
|
||||||
|
|
||||||
# print(buffer)
|
|
||||||
|
|
||||||
sub = []
|
|
||||||
for section in buffer:
|
|
||||||
strbuf = ""
|
|
||||||
for scene in section:
|
|
||||||
strbuf += scene["content"]
|
|
||||||
# if scene["content"][-1] == ".":
|
|
||||||
strbuf += "\n"
|
|
||||||
# else:
|
|
||||||
# strbuf += " "
|
|
||||||
scene["content"] = strbuf
|
|
||||||
sub.append(scene)
|
|
||||||
|
|
||||||
# print(buffer[0])
|
|
||||||
|
|
||||||
print(to_vtt(sub))
|
|
|
@ -0,0 +1,231 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import re, json
|
||||||
|
import os
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
def from_vtt(vtt_string):
    """Parse WebVTT text into a list of cue dictionaries.

    The text is split into blocks on blank lines. A leading ``WEBVTT``
    header block is dropped, and every block whose first line (or second
    line, after a purely numeric cue identifier) is a
    ``start --> end`` timing line becomes one cue. All other blocks
    (for example ``NOTE`` comments) are ignored.

    Args:
        vtt_string: the full WebVTT document as a string.

    Returns:
        A list of dicts with the keys ``start`` and ``end`` (timecode
        strings exactly as written in the file) and ``content`` (the cue
        payload lines joined with ``\\n``).
    """
    VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
    VTT_LINE_NUMBER_PATTERN = r"^\d+$"

    # Files saved with CRLF (or bare CR) line endings would otherwise never
    # match the blank-line split below and parse to nothing.
    text = vtt_string.replace("\r\n", "\n").replace("\r", "\n")
    parts = re.split(r'\n\n+', text.strip())
    if parts[0].startswith('WEBVTT'):
        parts.pop(0)

    subtitles = []
    for part in parts:
        lines = part.split('\n')
        match = re.match(VTT_TIMECODE_PATTERN, lines[0])
        if not match and re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
            # Numeric cue identifier: the timing line, if any, follows it.
            lines = lines[1:]
            # A block holding nothing but a number has no timing line.
            match = re.match(VTT_TIMECODE_PATTERN, lines[0]) if lines else None
        if not match:
            continue

        start, end = match.groups()
        subtitles.append({
            'start': start,
            'end': end,
            'content': '\n'.join(lines[1:]),
        })

    return subtitles
|
||||||
|
|
||||||
|
def to_vtt(subtitles):
    """Serialise a cue list to WebVTT text.

    Ordinary cues are written as a ``start --> end`` line followed by their
    content. Entries whose ``split`` key is truthy are written as a
    ``NOTE <content>`` block instead. Blocks are separated by two blank
    lines and the result is stripped of surrounding whitespace.

    Args:
        subtitles: iterable of dicts with ``content`` and, for ordinary
            cues, ``start`` and ``end``.

    Returns:
        The WebVTT document as a string.
    """
    def render(entry):
        # One block of output, including its trailing separator.
        text = entry['content']
        if entry.get("split", False):
            return f"NOTE {text}\n\n\n"
        return f"{entry['start']} --> {entry['end']}\n{text}\n\n\n"

    body = "".join(render(entry) for entry in subtitles)
    return ("WEBVTT\n\n\n" + body).strip()
|
||||||
|
|
||||||
|
def to_stacked_vtt(subtitles):
    """Serialise cues to WebVTT with text that accumulates from cue to cue.

    Every ordinary cue is written with its own timing but with the text of
    all cues since the last section break. An entry whose ``split`` key is
    truthy is a section break: it produces no output and clears the
    accumulated text.

    Consecutive cue texts are joined with a newline when the accumulated
    text already ends a sentence (ends with ``.``), otherwise with a space.

    Args:
        subtitles: iterable of cue dicts with ``content``, ``start`` and
            ``end`` (and optionally ``split``).

    Returns:
        The WebVTT document as a string, stripped of surrounding whitespace
        like the output of ``to_vtt``.

    NOTE(review): ``to_time`` is not defined in the part of the module
    visible here — confirm it exists, otherwise this raises NameError on the
    first cue.
    """
    chunks = ["WEBVTT\n\n\n"]
    buffer = ""
    for subtitle in subtitles:
        if subtitle.get("split", False):
            buffer = ""
            continue

        text = subtitle['content'].strip()
        if buffer and text:
            # Decide from what is already stacked, not from the incoming
            # cue: a finished sentence pushes the next words to a new line.
            buffer += "\n" if buffer.endswith(".") else " "
        buffer += text

        chunks.append(
            f"{to_time(subtitle['start'])} --> {to_time(subtitle['end'])}\n"
            f"{buffer}\n\n\n"
        )

    return "".join(chunks).strip()
|
||||||
|
|
||||||
|
def create_word_scenes(wordvtt, scriptraw):
    """Align a word-by-word VTT track with the lines of a plain-text script.

    Each cue in ``wordvtt`` is expected to contain a whole script line with
    the currently spoken word wrapped in ``<u>...</u>``. Cues are matched to
    script lines in order, and the timing of every underlined word is
    collected. Lines and cues consisting of a single space-separated token
    are skipped on both sides.

    Args:
        wordvtt: WebVTT text with one cue per spoken word.
        scriptraw: the script, one line per scene.

    Returns:
        ``(full_script, full_scenes)`` where ``full_script`` is the flat list
        of words and ``full_scenes`` is the parallel list of dicts with
        ``start``, ``end``, ``index`` (position inside its line) and
        ``word``. Returns ``None`` after printing a message when the track
        and the script do not line up.
    """
    subtitles = from_vtt(wordvtt)
    scripts = [line for line in scriptraw.split("\n") if line]
    print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")

    scenes = [
        {"scene": script, "timestamp": []}
        for script in scripts
        if len(script.split(" ")) != 1
    ]

    scenes_cur = 0
    for subtitle in subtitles:
        raw = subtitle["content"]
        sentence = raw.replace("<u>", "").replace("</u>", "")
        if len(sentence.split(" ")) == 1:
            continue

        # Stay on the current line, or advance by exactly one line. Both
        # lookups are bounds-checked so an empty or exhausted script is
        # reported as a mismatch instead of raising IndexError.
        on_current = scenes_cur < len(scenes) and sentence == scenes[scenes_cur]["scene"]
        if not on_current:
            following = scenes_cur + 1
            if following < len(scenes) and sentence == scenes[following]["scene"]:
                scenes_cur = following
            else:
                print(f"Error, Mismatch\n=> scenes[{scenes_cur}] != \"{sentence}\"")
                return

        current_scene = scenes[scenes_cur]
        if current_scene["timestamp"]:
            word_idx = current_scene["timestamp"][-1]["index"] + 1
        else:
            word_idx = 0

        if "<u>" in raw:
            word = raw.split("<u>")[1].split("</u>")[0]
            if word not in sentence:
                print(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
                return

            # Explicit check rather than assert/bare-except: asserts are
            # removed under ``python -O`` and a bare except hides Ctrl-C.
            words = sentence.split(" ")
            if word_idx >= len(words) or words[word_idx] != word:
                print(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
                return

            current_scene["timestamp"].append({
                "start": subtitle["start"],
                "end": subtitle["end"],
                "index": word_idx,
                "word": word,
            })

    # Every word of every kept line must have received a timestamp.
    for scene in scenes:
        if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
            print("Error, Mismatch length")
            return

    full_script, full_scenes = [], []
    for scene in scenes:
        full_script += scene["scene"].split(" ")
        full_scenes += scene["timestamp"]

    # The per-line length check above guarantees both lists are equally
    # long, so zip covers every word.
    for script_word, stamp in zip(full_script, full_scenes):
        if script_word != stamp["word"]:
            print("Error, Mismatch")
            return

    return full_script, full_scenes
|
||||||
|
|
||||||
|
def scene_from_new_script(raw_script, full_script, full_scenes):
    """Map the words of an edited script back onto the original word timings.

    The edited script may re-wrap lines, change case, add or remove ``.``
    and ``,``, and drop words, but the words that remain must appear in the
    original order. Line breaks are attached to the word before them
    (``"word\\n"``, or ``"word\\n\\n"`` for an empty line after it).

    Args:
        raw_script: the edited script text.
        full_script: original words, as returned by ``create_word_scenes``.
        full_scenes: timing dicts parallel to ``full_script``; ``start`` and
            ``end`` are read.

    Returns:
        ``(new_script, new_timestamp)``: the edited words (with their
        trailing newlines) and a parallel list of ``{"start", "end"}`` dicts.

    Raises:
        IndexError: when an edited word cannot be found in the remaining
            original words.
    """
    mod_script = []
    for token in raw_script.replace("\n", " \n ").split(" "):
        if not token:
            continue
        if token == "\n":
            # Fold the break into the preceding word. A break before the
            # first word has nothing to attach to and is dropped (it must
            # not wrap around to the last word).
            if mod_script:
                mod_script[-1] += "\n"
            continue
        mod_script.append(token)

    print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
    allowed_list = [".", "\n", "\n\n", ","]

    def normalized(x):
        # Comparison key: punctuation/line-break insensitive, case-insensitive.
        for i in allowed_list:
            x = x.replace(i, "")
        return x.upper()

    new_script, new_timestamp, orig_index = [], [], 0
    for word in mod_script:
        key = normalized(word)
        # Skip original words that were removed from the edited script.
        while orig_index < len(full_script) and normalized(full_script[orig_index]) != key:
            orig_index += 1
        if orig_index >= len(full_script):
            raise IndexError(
                f"word {word!r} of the edited script was not found in the original script"
            )

        cur = full_scenes[orig_index]
        new_script.append(word)
        new_timestamp.append({"start": cur["start"], "end": cur["end"]})
        orig_index += 1

    return new_script, new_timestamp
|
||||||
|
|
||||||
|
def build_new_subtitle(new_script, new_timestamp):
    """Group timed words into subtitle cues, one cue per script line.

    A word ending in a newline closes the current cue; a word carrying a
    double newline additionally closes the current section. Sections are
    separated in the result by a marker entry with ``"split": True`` and the
    content ``"Break"``.

    Args:
        new_script: words, with line breaks attached to the preceding word
            (as produced by ``scene_from_new_script``).
        new_timestamp: parallel list of dicts with ``start`` and ``end``.

    Returns:
        A flat list of cue dicts (``content``, ``start``, ``end``) with
        break markers between sections.
    """
    new_scenes, current_scene = [], []
    buffer, start, last_end = [], None, None

    for word, stamp in zip(new_script, new_timestamp):
        # The first word of a cue fixes its start time, including when that
        # word is also the last one on the line.
        if start is None:
            start = stamp["start"]
        buffer.append(word.replace("\n", ""))
        last_end = stamp["end"]

        if "\n" not in word:
            continue

        current_scene.append({"content": " ".join(buffer), "start": start, "end": last_end})
        buffer, start = [], None

        if "\n\n" in word:
            last_line = current_scene[-1]["content"]
            print(f"Section break at line #{len(current_scene):<3}| \"{last_line}\"")
            new_scenes.append(current_scene)
            current_scene = []

    # Words after the final line break form one last cue.
    if buffer:
        current_scene.append({"content": " ".join(buffer), "start": start, "end": last_end})

    # An empty trailing section would only add a dangling break marker.
    if current_scene:
        new_scenes.append(current_scene)

    newsub = []
    for n, scene in enumerate(new_scenes):
        newsub += scene
        if n < len(new_scenes) - 1:
            newsub.append({"content": "Break", "start": None, "end": None, "split": True})

    return newsub
|
||||||
|
|
||||||
|
def saveFile(filename, data, override = False):
    """Write ``data`` to ``filename`` as text.

    An existing file is left untouched unless ``override`` is true; in that
    case a message is printed and ``-1`` is returned. On a successful write
    the function returns ``None``.
    """
    blocked = os.path.exists(filename) and not override
    if not blocked:
        with open(filename, "w") as handle:
            handle.write(data)
        return None

    print(f"File {filename} already exists.")
    return -1
|
||||||
|
|
||||||
|
def openFile(filename):
    """Read ``filename`` as text and return its contents.

    Returns ``-1`` instead of an empty string when the file is empty.
    """
    with open(filename, "r") as handle:
        text = handle.read()
    return text if text else -1
|
||||||
|
|
||||||
|
def main():
    """Run the re-timing pipeline on the fixed ``test.*`` files.

    Steps:
      1. Align ``test.vtt`` (word-level cues) with ``test.txt`` (script).
      2. Write the flattened script to ``test.script`` unless that file
         already exists, so a hand-edited copy is never overwritten.
      3. Read ``test.script`` back and map its words to the original timings.
      4. Write the regrouped cues to ``test.final.vtt`` and
         ``test.final.json`` (always overwritten).

    The function stops early, after a printed message, when an input file is
    empty or the alignment fails.
    """
    vttfile = "test.vtt"
    scriptfile = "test.txt"
    modfile = "test.script"

    wordvtt = openFile(vttfile)
    scriptraw = openFile(scriptfile)
    # openFile reports an empty file with the sentinel -1, which must not be
    # passed on as if it were text.
    if wordvtt == -1 or scriptraw == -1:
        print("Error, empty input file")
        return

    aligned = create_word_scenes(wordvtt, scriptraw)
    if aligned is None:
        # create_word_scenes has already printed the reason.
        return
    full_script, full_scenes = aligned

    saveFile(modfile, " ".join(full_script).replace(". ", ".\n"))

    modded = openFile(modfile)
    if modded == -1:
        print(f"Error, {modfile} is empty")
        return

    a, b = scene_from_new_script(modded, full_script, full_scenes)
    final_vtt = build_new_subtitle(a, b)
    saveFile("test.final.vtt", to_vtt(final_vtt), True)
    saveFile("test.final.json", json.dumps(final_vtt, indent=2), True)


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue