Fix
This commit is contained in:
parent
368c26d491
commit
826aca5c9f
41
backup.py
41
backup.py
|
@ -1,11 +1,12 @@
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def parse_vtt(vtt_filename):
    """Parse a seconds-based VTT file into a list of cue dictionaries.

    The first line of the file (the header) is skipped. A cue starts at a
    line matching ``<seconds>.<mmm> --> <seconds>.<mmm>`` and ends at the
    next blank line or at end of file.

    Args:
        vtt_filename: Path of the UTF-8 encoded file to read.

    Returns:
        A list of ``{"start": float, "end": float, "content": str}`` dicts.
        Text lines of a cue are stripped and joined with ``"\\n"``.
    """
    with open(vtt_filename, "r", encoding="utf-8") as file:
        lines = file.readlines()

    time_pattern = re.compile(r"(\d+\.\d{3}) --> (\d+\.\d{3})")

    subtitles = []
    current_subtitle = {}

    def flush():
        # Drop the single trailing newline added after the last text line.
        # endswith() is safe for cues without any text (empty content).
        if current_subtitle["content"].endswith("\n"):
            current_subtitle["content"] = current_subtitle["content"][:-1]
        subtitles.append(current_subtitle)

    for line in lines[1:]:
        match = time_pattern.match(line)
        if match:
            current_subtitle["start"] = float(match.group(1))
            current_subtitle["end"] = float(match.group(2))
            current_subtitle["content"] = ""
        elif line.strip() == "":
            if current_subtitle:
                flush()
                current_subtitle = {}
        elif current_subtitle:
            # Newline keeps the cue's text lines separate.
            current_subtitle["content"] += line.strip() + "\n"
        # else: text outside any cue (e.g. extra header lines) is ignored.

    if current_subtitle:
        flush()

    return subtitles
|
||||||
|
|
||||||
def subtitles_to_backup(subtitles):
    """Wrap parsed subtitles in the backup structure.

    The given list is stored as-is under ``"subtitles"``; ``"line_index"``
    is set to its length and the remaining fields get their empty defaults.
    """
    return dict(
        subtitles=subtitles,
        script_lines=[],
        line_index=len(subtitles),
        current_subtitle={},
        play=0,
    )
|
||||||
|
|
||||||
|
|
||||||
def main(vtt_filename, output_filename):
    """Convert a VTT file into a backup JSON file.

    Parses ``vtt_filename`` with ``parse_vtt``, wraps the result with
    ``subtitles_to_backup`` and writes it to ``output_filename`` as
    2-space indented JSON, UTF-8 encoded.
    """
    backup_data = subtitles_to_backup(parse_vtt(vtt_filename))
    serialized = json.dumps(backup_data, indent=2)

    with open(output_filename, "w", encoding="utf-8") as json_file:
        json_file.write(serialized)
|
||||||
|
|
||||||
# Fixed input/output paths for this one-off conversion script.
vtt_filename, output_filename = "audio.vtt", "backup2.json"
main(vtt_filename, output_filename)
|
||||||
|
|
717
snusub.py
717
snusub.py
|
@ -6,410 +6,479 @@ from datetime import timedelta
|
||||||
|
|
||||||
###
|
###
|
||||||
|
|
||||||
|
|
||||||
def from_vtt(vtt_string):
    """Parse WebVTT text into a list of cue dictionaries.

    The text is split into blocks on blank lines. A leading ``WEBVTT`` block
    is dropped. Each remaining block may start with a numeric cue identifier,
    followed by a ``[HH:]MM:SS.mmm --> [HH:]MM:SS.mmm`` line; blocks without
    such a line are skipped.

    Args:
        vtt_string: Full contents of a VTT file.

    Returns:
        A list of ``{"start": str, "end": str, "content": str}`` dicts with
        the timestamps kept as strings. In the content, hyphens and ``%`` are
        replaced by spaces and ``<u>`` tags are normalised so that spaces sit
        outside the tags; empty tag pairs are removed.
    """
    VTT_TIMECODE_PATTERN = (
        r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
    )
    VTT_LINE_NUMBER_PATTERN = r"^\d+$"
    parts = re.split(r"\n\n+", vtt_string.strip())
    if parts[0].startswith("WEBVTT"):
        parts.pop(0)

    subtitles = []
    for part in parts:
        lines = part.split("\n")
        match = re.match(VTT_TIMECODE_PATTERN, lines[0])
        if not match:
            if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
                lines.pop(0)
            if not lines:
                # Block held only a cue number: nothing left to parse.
                continue
            match = re.match(VTT_TIMECODE_PATTERN, lines[0])
        if not match:
            continue

        start, end = match.groups()
        # A trailing newline is added so the line-end replacements below also
        # apply to the last line; it is removed again by the final [:-1].
        content = "\n".join(lines[1:]) + "\n"

        subtitles.append(
            {
                "start": start,
                "end": end,
                "content": (
                    content.replace("-\n", "\n")
                    .replace("</u>-\n", "</u>\n")
                    .replace("-", " ")
                    .replace("%", " ")
                    .replace("<u> ", " <u>")
                    .replace(" </u>", "</u> ")
                    .replace("<u> </u>", "")
                    .replace("<u></u>", "")
                    .replace(" \n", "\n")
                )[:-1],
            }
        )

    return subtitles
|
|
||||||
|
|
||||||
def to_vtt(subtitles):
    """Serialise cue dictionaries to WebVTT text.

    Entries whose ``"split"`` key is truthy are written as ``NOTE`` blocks;
    all others are written as a timing line followed by their content.

    Args:
        subtitles: Iterable of dicts with ``"content"`` and, for non-split
            entries, ``"start"`` and ``"end"``.

    Returns:
        The VTT document with leading/trailing whitespace stripped.

    Raises:
        Exception: If a non-split entry has a missing start or end, or its
            start equals its end.
    """
    # Collect the pieces and join once instead of growing a string in a loop.
    pieces = ["WEBVTT\n\n\n"]
    for idx, subtitle in enumerate(subtitles):
        content = subtitle["content"]
        if subtitle.get("split", False):
            pieces.append(f"NOTE {content}\n\n\n")
            continue

        start = subtitle["start"]
        end = subtitle["end"]
        if not start or not end or start == end:
            raise Exception(f"VTT timestamp parse error from #{idx}.")
        pieces.append(f"{start} --> {end}\n{content}\n\n\n")

    return "".join(pieces).strip()
|
||||||
|
|
||||||
def to_stacked_vtt(subtitles, continous=True):
    """Serialise cues to WebVTT where each cue repeats the text before it.

    Within a section, every cue shows the accumulated text of all cues since
    the last ``"split"`` entry. A cue whose text ends with ``"."`` is joined
    to the accumulated text with a newline, any other cue with a space.
    Each written cue is also printed to stdout.

    Args:
        subtitles: List of cue dicts (``"start"``, ``"end"``, ``"content"``,
            optional ``"split"``). Split entries reset the accumulated text
            and produce no output.
        continous: When true, a cue's end time is the start time of the next
            entry, unless that entry is a split or the cue is the last one.

    Returns:
        The VTT document as a string (not stripped).

    Raises:
        Exception: If a cue has no start or end time, or both are equal.
    """
    pieces = ["WEBVTT\n\n\n"]
    buffer = ""
    last_index = len(subtitles) - 1
    for n, subtitle in enumerate(subtitles):
        if subtitle.get("split", False):
            buffer = ""
            continue

        text = subtitle["content"].strip()
        if buffer:
            # endswith() instead of [-1] so an empty cue cannot raise IndexError.
            buffer += "\n" if text.endswith(".") else " "
        buffer += text

        end_time = subtitle["end"]
        if n < last_index and continous and not subtitles[n + 1].get("split", False):
            end_time = subtitles[n + 1]["start"]

        if not subtitle["start"] or not end_time:
            # The original referenced an undefined name here; n is the cue index.
            raise Exception(f"VTT timestamp parse error from #{n}.")
        if subtitle["start"] == end_time:
            raise Exception(
                f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}"
            )

        pieces.append(f"{subtitle['start']} --> {end_time}\n")
        pieces.append(buffer)
        pieces.append("\n\n\n")

        print(f"{subtitle['start']} --> {end_time}\n{buffer}\n\n")

    return "".join(pieces)
|
|
||||||
|
|
||||||
###
|
###
|
||||||
|
|
||||||
|
|
||||||
def script_from_word_vtt(wordvtt):
    """Build the list of sentences from a word-highlighted VTT string.

    Each cue's text, with ``<u>``/``</u>`` removed, is appended whenever it
    differs from the last appended sentence. When a cue's content ends with
    ``</u>``, the next cue's sentence is appended even if it is identical,
    unless the cue two positions ahead has a different text.

    Returns:
        The sentences in order of appearance.
    """
    cues = from_vtt(wordvtt)
    print("Generating script file from VTT...")

    def untagged(text):
        return text.replace("<u>", "").replace("</u>", "")

    script_lines = []
    force_append = False
    total = len(cues)
    for pos, cue in enumerate(cues):
        raw = cue["content"]
        plain = untagged(raw)

        previous = script_lines[-1] if script_lines else None
        if force_append or previous != plain:
            script_lines.append(plain)
            force_append = False

        if raw[-4:] != "</u>":
            continue

        # Content ends with the closing tag: force the next sentence in,
        # except when the cue two steps ahead already has another text.
        force_append = True
        if pos + 2 < total and untagged(cues[pos + 2]["content"]) != plain:
            force_append = False

    return script_lines
|
||||||
|
|
||||||
|
|
||||||
def create_word_scenes(raw_vtt, raw_script):
    """Attach a timestamp to every word of every scene in a script.

    ``raw_vtt`` is parsed with ``from_vtt``; each cue whose content has a
    ``<u>word</u>`` marker contributes that word's start/end time to the
    scene whose text equals the cue's untagged text. Single-word script
    lines and single-word cues are ignored.

    Args:
        raw_vtt: Contents of the word-level VTT file.
        raw_script: Script text, one scene per non-empty line.

    Returns:
        ``(full_script, full_scenes)``: the flat list of words (the last word
        of each scene suffixed with ``"##"``) and the parallel list of
        ``{"start", "end", "index", "word"}`` dicts.

    Raises:
        Exception: If a cue cannot be matched to the current or next scene,
            a highlighted word is not at the expected position, or a scene's
            word count differs from its number of timestamps.
    """
    subtitles = from_vtt(raw_vtt)
    scripts = [line for line in raw_script.split("\n") if line]
    print(
        f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words..."
    )

    # Single-word lines are never treated as scenes.
    scenes = [
        {"scene": script, "timestamp": []}
        for script in scripts
        if len(script.split(" ")) != 1
    ]

    def scene_text(index):
        # None when the index is past the last scene, so lookups never raise.
        return scenes[index].get("scene") if index < len(scenes) else None

    def next_word_index(scene):
        stamps = scene["timestamp"]
        return stamps[-1]["index"] + 1 if stamps else 0

    scenes_cur = 0
    for subtitle in subtitles:
        content = subtitle["content"]
        sentence = content.replace("<u>", "").replace("</u>", "")
        words = sentence.split(" ")
        if len(words) == 1:
            continue

        if sentence != scene_text(scenes_cur):
            if sentence == scene_text(scenes_cur + 1):
                scenes_cur += 1
            else:
                # Values are computed outside the f-string: reusing the same
                # quote character inside a replacement field needs Python 3.12.
                current_text = scene_text(scenes_cur)
                next_text = scene_text(scenes_cur + 1)
                raise Exception(
                    f'Error, Failed to match sentence with scene.\n"{current_text}" or "[{scenes_cur + 1}] {next_text}" != "{sentence}"'
                )

        current_scene = scenes[scenes_cur]
        word_idx = next_word_index(current_scene)
        highlighted = "<u>" in content

        # A non-highlighted cue after the last indexed word is a trailing
        # dummy subtitle and is ignored (nothing is recorded for it below).
        if highlighted and word_idx >= len(words):
            # The scene is already fully indexed, so this cue belongs to a
            # repetition of the same sentence: step to the next scene.
            print(
                f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(words)}\n{subtitle}"
            )
            scenes_cur += 1
            if scenes_cur >= len(scenes):
                raise Exception(
                    f'Error, Failed to match sentence with scene.\nNo scene left for "{sentence}"'
                )
            current_scene = scenes[scenes_cur]
            word_idx = next_word_index(current_scene)
            print(f"Changed to {word_idx}, {scenes_cur}")

        # Start matching words.
        if highlighted:
            word = content.split("<u>")[1].split("</u>")[0]

            if word not in words:
                raise Exception(f'Error, Mismatch\n=> "{word}" not in "{sentence}"')

            # Explicit check instead of assert, which is stripped under -O.
            if word_idx >= len(words) or words[word_idx] != word:
                raise Exception(
                    f'Error, Mismatch\n=> "{word}" != [{word_idx}] of "{sentence}"'
                )

            current_scene["timestamp"].append(
                {
                    "start": subtitle["start"],
                    "end": subtitle["end"],
                    "index": word_idx,
                    "word": word,
                }
            )

    for scene in scenes:
        scene_words = scene["scene"].split(" ")
        if len(scene_words) != len(scene["timestamp"]):
            raise Exception("Error, Scene length and timestamp length doesnt match.")
        if "" in scene_words:
            print(repr(scene["scene"]))

    full_script, full_scenes = [], []
    for scene in scenes:
        scene_words = scene["scene"].split(" ")
        full_script += scene_words[:-1]
        # "##" marks the last word of a scene.
        full_script.append(scene_words[-1] + "##")
        full_scenes += scene["timestamp"]

    for script_word, stamp in zip(full_script, full_scenes):
        if script_word.replace("##", "") != stamp["word"]:
            raise Exception("Error, Mismatch")

    if len(full_scenes) != len(full_script):
        raise Exception("Error, Scene length and timestamp length doesnt match.")

    return full_script, full_scenes
|
|
||||||
|
|
||||||
def scene_from_new_script(raw_script, full_script, full_scenes):
    """Map the words of an edited script onto the original word timestamps.

    The edited script is split on spaces; newlines are attached to the word
    before them. Each edited word is compared with the original words in
    order, ignoring case and the characters ``. , ?``, newlines and ``##``.
    Original words that do not match are skipped, at most 11 in a row.

    Args:
        raw_script: Edited script text.
        full_script: Original words, as returned by ``create_word_scenes``.
        full_scenes: Timestamp dicts parallel to ``full_script``.

    Returns:
        ``(new_script, new_timestamp)``: the edited words (newlines kept,
        ``##`` removed) and a parallel list of ``{"start", "end"}`` dicts.

    Raises:
        Exception: If a word cannot be matched within the allowed number of
            skips or the original script runs out of words.
    """
    # Attach each newline to the word before it. Newlines with no preceding
    # word (leading blank lines) are dropped.
    mod_script = []
    for token in raw_script.replace("\n", " \n ").split(" "):
        if not token:
            continue
        if token == "\n":
            if mod_script:
                mod_script[-1] += "\n"
            continue
        mod_script.append(token)

    print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
    allowed_list = [".", "\n", "\n\n", ",", "?", "##"]

    def normalized(x):
        for ignored in allowed_list:
            x = x.replace(ignored, "")
        return x.upper()

    new_script, new_timestamp, orig_index, n = [], [], 0, 0
    fail = 0
    while n < len(mod_script):
        if orig_index >= len(full_script):
            # Original ran out of words before the edited script did.
            raise Exception("Error: Failed to match words,")

        print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
        word = mod_script[n]
        if normalized(word) == normalized(full_script[orig_index].replace("##", "")):
            cur = full_scenes[orig_index]
            new_script.append(word.replace("##", ""))
            new_timestamp.append({"start": cur["start"], "end": cur["end"]})
            fail = 0
            n += 1
        else:
            if fail > 10:
                raise Exception("Error: Failed to match words,")
            # Skip this original word and retry the same edited word.
            fail += 1
        orig_index += 1

    return new_script, new_timestamp
|
|
||||||
|
|
||||||
def build_new_subtitle(new_script, new_timestamp):
    """Group timestamped words into subtitle lines and sections.

    A word containing a newline ends the current line; a word containing a
    double newline also ends the current section. Sections are joined in the
    result with a ``{"content": "Break", "split": True}`` entry between them.

    Args:
        new_script: Words, with newlines kept on the word that ends a line.
        new_timestamp: ``{"start", "end"}`` dicts parallel to ``new_script``.

    Returns:
        A flat list of ``{"content", "start", "end"}`` dicts. Each line runs
        from the start of its first word to the end of its last word.
    """
    buffer, new_scenes, start = [], [], None
    current_scene = []
    last_end = None

    for word, stamp in zip(new_script, new_timestamp):
        buffer.append(word.replace("\n", ""))
        last_end = stamp["end"]
        if not start:
            start = stamp["start"]

        if "\n" in word:
            current_scene.append(
                {
                    "content": " ".join(buffer).replace("##", ""),
                    "start": start,
                    "end": stamp["end"],
                }
            )
            buffer, start = [], None

        if "\n\n" in word:
            # Computed outside the f-string: reusing the quote character
            # inside a replacement field needs Python 3.12.
            last_content = current_scene[-1]["content"]
            print(f'Section break at line #{len(current_scene):<3}| "{last_content}"')
            new_scenes.append(current_scene)
            current_scene = []

    if start:
        # Flush words after the last newline. They are already in the buffer,
        # so the last word must not be appended a second time.
        current_scene.append(
            {
                "content": " ".join(buffer).replace("##", ""),
                "start": start,
                "end": last_end,
            }
        )

    if current_scene != (new_scenes[-1] if new_scenes else None):
        new_scenes.append(current_scene)

    newsub = []
    for n, scene in enumerate(new_scenes):
        newsub += scene
        if n < len(new_scenes) - 1:
            newsub.append(
                {"content": "Break", "start": None, "end": None, "split": True}
            )

    return newsub
|
|
||||||
|
|
||||||
###
|
###
|
||||||
|
|
||||||
|
|
||||||
def autobreak(lines, times):
    """Generate script text from timestamped words, breaking at long pauses.

    The mean of all positive pauses between consecutive words is computed
    and printed. A newline is then inserted before a word that follows a
    pause longer than the mean (or at least the mean when the previous word
    ends with "."). After each word a newline is written if the word ends
    with "." or contains "##", otherwise a space.

    Args:
        lines: Words; the last word of a scene carries a ``"##"`` suffix.
        times: Dicts with ``"start"``/``"end"`` timestamps in
            ``[HH:]MM:SS.mmm`` form, parallel to ``lines``.

    Returns:
        The generated script as a single string, with ``"##"`` removed.
    """
    from datetime import timedelta

    def parsetime(time_str):
        # Accept both "MM:SS.mmm" and "HH:MM:SS.mmm"; the VTT parser in this
        # file produces either form.
        fields = time_str.split(":")
        if len(fields) == 2:
            fields.insert(0, "0")
        hours, minutes, seconds = fields
        seconds, milliseconds = seconds.split(".")
        return timedelta(
            hours=int(hours),
            minutes=int(minutes),
            seconds=int(seconds),
            milliseconds=int(milliseconds),
        )

    zero = parsetime("0:0.0")

    # First pass: collect every positive pause between consecutive words.
    long_breaks = []
    tmark = zero
    for _, stamp in zip(lines, times):
        tdiff = parsetime(stamp["start"]) - tmark
        tmark = parsetime(stamp["end"])
        if tdiff > zero:
            long_breaks.append(tdiff)

    mean_break = zero
    for pause in long_breaks:
        mean_break += pause / len(long_breaks)
    print(mean_break)

    # Second pass: emit the words, breaking lines at long pauses.
    script = ""
    tmark = zero
    previous = " "
    for word, stamp in zip(lines, times):
        tdiff = parsetime(stamp["start"]) - tmark
        tmark = parsetime(stamp["end"])
        after_period = previous[-1] == "."
        if (tdiff > mean_break and not after_period) or (
            tdiff >= mean_break and after_period
        ):
            script += "\n"

        script += word.replace("##", "")

        if word[-1] == "." or "##" in word:
            script += "\n"
        else:
            script += " "
        previous = word

    return script
|
|
||||||
|
|
||||||
############################################
|
############################################
|
||||||
|
|
||||||
def saveFile(filename, data, override=False):
    """Write ``data`` to ``filename`` as UTF-8 text.

    Args:
        filename: Destination path.
        data: Text to write.
        override: When false, an existing file is not overwritten.

    Raises:
        Exception: If the file already exists and ``override`` is false.
    """
    if os.path.exists(filename) and not override:
        raise Exception(f"File {filename} already exists.")
    # Explicit encoding: the platform default may not be able to encode
    # non-ASCII subtitle text.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data)
|
||||||
|
|
||||||
|
|
||||||
def openFile(filename):
    """Read ``filename`` as UTF-8 text and return its contents.

    Raises:
        Exception: If the file does not exist or is empty.
    """
    if not os.path.exists(filename):
        raise Exception(f"File {filename} doesnt exists.")
    # Explicit encoding so reading does not depend on the platform default.
    with open(filename, "r", encoding="utf-8") as f:
        data = f.read()
    if not data:
        raise Exception("Data empty.")
    return data
|
||||||
|
|
||||||
|
|
||||||
############################################
|
############################################
|
||||||
|
|
||||||
# Command-line entry point:
#   script <VTT file>                 write <VTT basename>.script
#   apply  <VTT file> <script file>   write <VTT basename>.json
#   create <JSON file>                write a stacked VTT next to the JSON
if __name__ == "__main__":
    PROG = sys.argv[0].split("/")[-1]
    if len(sys.argv) not in (3, 4):
        print(
            f"""Usage: {PROG} [COMMAND] [FILES]...

Commands:
- script <VTT file> Generates script file from vtt file.
- apply <VTT file> <script file> Applies new scripted file to create JSON file.
- create <JSON file> Creates new vtt from given JSON.
"""
        )
        sys.exit()

    COMMAND = sys.argv[1]
    if COMMAND not in ["script", "apply", "create"]:
        print("Error. Command not found.")
        sys.exit()

    print(f"-> {sys.argv}")
    if COMMAND == "script":
        FILE = sys.argv[2]
        if not os.path.exists(FILE):
            print(f"Input file doesnt exists.")
            sys.exit(-1)

        # Output path is derived from the input file; the original used an
        # undefined name here, which raised NameError.
        modfile = ".".join(FILE.split(".")[:-1]) + ".script"
        x = create_word_scenes(
            openFile(FILE), "\n".join(script_from_word_vtt(openFile(FILE)))
        )
        if not x:
            sys.exit(-1)

        full_script, full_scenes = x
        genscript = autobreak(full_script, full_scenes)
        saveFile(modfile, genscript)
        print(f"Saved script file {modfile}.")

    elif COMMAND == "apply":
        if len(sys.argv) != 4:
            print(f"Not sufficient input.")
            sys.exit()

        FILE1, FILE2 = sys.argv[2], sys.argv[3]
        if (not os.path.exists(FILE1)) or (not os.path.exists(FILE2)):
            print(f"Input file doesnt exists.")
            sys.exit(-1)

        x = create_word_scenes(
            openFile(FILE1), "\n".join(script_from_word_vtt(openFile(FILE1)))
        )
        if not x:
            sys.exit(-1)
        full_script, full_scenes = x

        x = scene_from_new_script(openFile(FILE2), full_script, full_scenes)
        if not x:
            sys.exit(-1)
        a, b = x

        final_sub = build_new_subtitle(a, b)
        jsonfile = ".".join(FILE1.split(".")[:-1]) + ".json"
        saveFile(jsonfile, json.dumps(final_sub, indent=2), True)
        print(f"Saved JSON file {jsonfile}.")
        sys.exit(0)

    elif COMMAND == "create":
        FILE = sys.argv[2]
        if not os.path.exists(FILE):
            print(f"Input file doesnt exists.")
            sys.exit(-1)

        final_vtt = json.loads(openFile(FILE))
        orgf = ".".join(FILE.split(".")[:-1])

        # Do not overwrite an existing <name>.vtt; write <name>.stacked.vtt instead.
        if os.path.exists(orgf + ".vtt"):
            outfile = orgf + ".stacked.vtt"
        else:
            outfile = orgf + ".vtt"
        saveFile(outfile, to_stacked_vtt(final_vtt), True)
        # Report the path actually written, after the write succeeded.
        print(f"Saved VTT file as {outfile}.")
        sys.exit(0)
|
||||||
|
|
36
stackvtt.py
36
stackvtt.py
|
@ -1,18 +1,21 @@
|
||||||
import re
|
import re
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
||||||
VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
|
VTT_TIMECODE_PATTERN = (
|
||||||
|
r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
|
||||||
|
)
|
||||||
VTT_LINE_NUMBER_PATTERN = r"^\d+$"
|
VTT_LINE_NUMBER_PATTERN = r"^\d+$"
|
||||||
|
|
||||||
def from_vtt(vtt_string):
|
|
||||||
parts = re.split(r'\n\n+', vtt_string.strip())
|
|
||||||
|
|
||||||
if parts[0].startswith('WEBVTT'):
|
def from_vtt(vtt_string):
|
||||||
|
parts = re.split(r"\n\n+", vtt_string.strip())
|
||||||
|
|
||||||
|
if parts[0].startswith("WEBVTT"):
|
||||||
parts.pop(0)
|
parts.pop(0)
|
||||||
|
|
||||||
subtitles = []
|
subtitles = []
|
||||||
for part in parts:
|
for part in parts:
|
||||||
lines = part.split('\n')
|
lines = part.split("\n")
|
||||||
match = re.match(VTT_TIMECODE_PATTERN, lines[0])
|
match = re.match(VTT_TIMECODE_PATTERN, lines[0])
|
||||||
if not match:
|
if not match:
|
||||||
if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
|
if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
|
||||||
|
@ -22,30 +25,28 @@ def from_vtt(vtt_string):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
start, end = match.groups()
|
start, end = match.groups()
|
||||||
content = '\n'.join(lines[1:])
|
content = "\n".join(lines[1:])
|
||||||
|
|
||||||
subtitles.append({
|
subtitles.append({"start": start, "end": end, "content": content})
|
||||||
'start': start,
|
|
||||||
'end': end,
|
|
||||||
'content': content
|
|
||||||
})
|
|
||||||
|
|
||||||
return subtitles
|
return subtitles
|
||||||
|
|
||||||
|
|
||||||
def to_vtt(subtitles):
    """Serialize parsed cues back into WebVTT text.

    Args:
        subtitles: Iterable of dicts with "start", "end" and "content" keys.

    Returns:
        The document as one string, starting with the "WEBVTT" header and
        with leading/trailing whitespace stripped.
    """
    cues = [
        f"{subtitle['start']} --> {subtitle['end']}\n{subtitle['content']}\n\n"
        for subtitle in subtitles
    ]
    # Build the text once with join instead of growing a string with +=.
    return ("WEBVTT\n\n" + "".join(cues)).strip()
|
||||||
|
|
||||||
|
|
||||||
def stack_subtitle():
|
def stack_subtitle():
|
||||||
buffer = []
|
buffer = []
|
||||||
linebuf = []
|
linebuf = []
|
||||||
for line in parsed_vtt:
|
for line in parsed_vtt:
|
||||||
print(line["content"].strip())
|
print(line["content"].strip())
|
||||||
content = line["content"].strip()
|
content = line["content"].strip()
|
||||||
if True:
|
if True:
|
||||||
linebuf.append(line)
|
linebuf.append(line)
|
||||||
|
@ -62,12 +63,13 @@ def stack_subtitle():
|
||||||
# if scene["content"][-1] == ".":
|
# if scene["content"][-1] == ".":
|
||||||
strbuf += "\n"
|
strbuf += "\n"
|
||||||
# else:
|
# else:
|
||||||
# strbuf += " "
|
# strbuf += " "
|
||||||
scene["content"] = strbuf
|
scene["content"] = strbuf
|
||||||
sub.append(scene)
|
sub.append(scene)
|
||||||
|
|
||||||
|
|
||||||
with open("example.vtt", "r") as f:
|
with open("example.vtt", "r") as f:
|
||||||
vtt_content = f.read()
|
vtt_content = f.read()
|
||||||
|
|
||||||
parsed_vtt = from_vtt(vtt_content)
|
parsed_vtt = from_vtt(vtt_content)
|
||||||
print(to_vtt(stack_subtitle(parsed_vtt)))
|
print(to_vtt(stack_subtitle(parsed_vtt)))
|
||||||
|
|
133
subedit.py
133
subedit.py
|
@ -1,79 +1,88 @@
|
||||||
import json
|
import json
|
||||||
import os, sys
|
import os, sys
|
||||||
|
|
||||||
def readFile(file):
    """Return the whole text content of *file*.

    The file is decoded as UTF-8 regardless of the platform default so that
    subtitle text is read the same way on every system.

    Raises:
        Exception: if *file* does not exist.
    """
    if not os.path.exists(file):
        raise Exception(f"File {file} doesn't exists.")
    with open(file, "r", encoding="utf-8") as f:
        return f.read()


def writeFile(file, data, overwrite=False):
    """Write *data* to *file* as UTF-8 text.

    Args:
        file: Destination path.
        data: Text to write; must be non-empty.
        overwrite: When False (default) an existing file is never replaced.

    Returns:
        The value returned by the underlying ``write`` call (number of
        characters written).

    Raises:
        Exception: if the file exists and *overwrite* is False, or if *data*
            is empty.
    """
    if (not overwrite) and os.path.exists(file):
        raise Exception(f"File {file} already exists.")
    if not len(data):
        raise Exception("Tried to write empty data.")
    with open(file, "w", encoding="utf-8") as f:
        return f.write(data)
|
||||||
|
|
||||||
|
|
||||||
# Command-line driver: converts between the subtitle JSON and a plain-text
# ".edit" file.
#   <name>.json -> <name>.edit    numbered text lines, then a timestamp list
#   <name>.edit -> <name>.json.1  edited text merged with the original timings
file = sys.argv[1]

# Line that separates the editable text from the timestamp listing.
TIMESTAMP_MARKER = "############ TIMESTAMPS ############"

if ".json" in file:
    subtitles = json.loads(readFile(file))
    output = ""
    index = 0
    for subtitle in subtitles:
        if subtitle.get("split", False):
            # A section break is written as an empty line.
            output += "\n"
        else:
            index += 1
            content = subtitle["content"]
            output += f"{index:03} | {content.strip()}\n"

    output += TIMESTAMP_MARKER + "\n\n"

    index = 0
    for subtitle in subtitles:
        if not subtitle.get("split", False):
            index += 1
            start = subtitle["start"]
            end = subtitle["end"]
            output += f"{index:03} | {start} --> {end} \n"

    writeFile(os.path.splitext(file)[0] + ".edit", output)

elif ".edit" in file:
    subtitles = json.loads(readFile(os.path.splitext(file)[0] + ".json"))
    lines = readFile(file)

    # Index the non-break subtitles by their 0-based position.
    idx, sub = 0, {}
    for subtitle in subtitles:
        if not subtitle.get("split", False):
            sub[idx] = subtitle
            idx += 1

    new_brk, new_sub = [], {}
    for line in lines.split("\n"):
        # Lines come from split("\n") and never contain a newline, so the
        # marker must be compared without one; everything after it is the
        # timestamp listing and is not parsed as subtitle text.
        if line == TIMESTAMP_MARKER:
            break
        if line:
            # Split only on the first separator so text containing " | "
            # does not break the unpacking.
            idx, content = line.split(" | ", 1)
            idx = int(idx) - 1
            if sub[idx]["content"] != content:
                old_content = sub[idx]["content"]
                print(f"{idx} {old_content} -> {content}")
            new_sub[idx] = {
                "content": content,
                "start": sub[idx]["start"],
                "end": sub[idx]["end"],
            }
        else:
            # An empty line marks a break after the most recently read entry.
            new_brk.append(idx)

    output = []
    for n in sorted(new_sub):
        subtitle = new_sub[n]
        output.append(subtitle)
        if n in new_brk:
            output.append(
                {"content": "Break", "start": None, "end": None, "split": True}
            )

    writeFile(os.path.splitext(file)[0] + ".json.1", json.dumps(output, indent=2))
|
||||||
|
|
732
vttmaker.py
732
vttmaker.py
File diff suppressed because it is too large
Load Diff
703
wordvtt.py
703
wordvtt.py
|
@ -4,50 +4,64 @@ import re, json
|
||||||
import os
|
import os
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
||||||
|
|
||||||
def from_vtt(vtt_string):
    """Parse WebVTT text into a list of cue dictionaries.

    The text is split into blocks on blank lines. A leading block starting
    with "WEBVTT" is dropped. A block is kept when its first line (or its
    second line, if the first is a bare cue number) is a timecode line;
    every other block is skipped.

    Cue text is normalised before it is stored: hyphens and percent signs
    become spaces, spaces are moved outside ``<u>``/``</u>`` tags, empty
    ``<u></u>`` pairs are removed, spaces before a newline are removed and
    the final newline is dropped.

    Args:
        vtt_string: Complete WebVTT document as text.

    Returns:
        List of dicts with "start", "end" (timecode strings exactly as they
        appear in the input) and "content" (normalised cue text).
    """
    VTT_TIMECODE_PATTERN = (
        r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
    )
    VTT_LINE_NUMBER_PATTERN = r"^\d+$"
    parts = re.split(r"\n\n+", vtt_string.strip())
    if parts[0].startswith("WEBVTT"):
        parts.pop(0)

    subtitles = []
    for part in parts:
        lines = part.split("\n")
        match = re.match(VTT_TIMECODE_PATTERN, lines[0])
        if not match:
            if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
                lines.pop(0)
                # A block that held only a cue number has nothing left to
                # inspect; treat it as "no timecode" instead of indexing an
                # empty list.
                match = re.match(VTT_TIMECODE_PATTERN, lines[0]) if lines else None
            if not match:
                continue

        start, end = match.groups()
        content = "\n".join(lines[1:]) + "\n"
        subtitles.append(
            {
                "start": start,
                "end": end,
                "content": (
                    content.replace("-\n", "\n")
                    .replace("</u>-\n", "</u>\n")
                    .replace("-", " ")
                    .replace("%", " ")
                    .replace("<u> ", " <u>")
                    .replace(" </u>", "</u> ")
                    .replace("<u> </u>", "")
                    .replace("<u></u>", "")
                    .replace(" \n", "\n")
                )[:-1],
            }
        )

    return subtitles
|
|
||||||
|
|
||||||
def to_vtt(subtitles):
    """Serialize subtitle entries into WebVTT text.

    Entries flagged with a truthy "split" key are written as ``NOTE`` blocks
    carrying their content; all other entries are written as timed cues.
    Blocks are separated by two blank lines.

    Args:
        subtitles: Iterable of dicts with a "content" key and, for entries
            that are not section breaks, "start" and "end" keys.

    Returns:
        The document as one string, starting with the "WEBVTT" header and
        with leading/trailing whitespace stripped.
    """
    blocks = []
    for subtitle in subtitles:
        content = subtitle["content"]
        if subtitle.get("split", False):
            blocks.append(f"NOTE {content}\n\n\n")
        else:
            blocks.append(
                f"{subtitle['start']} --> {subtitle['end']}\n{content}\n\n\n"
            )
    # Join once instead of repeatedly extending a string.
    return ("WEBVTT\n\n\n" + "".join(blocks)).strip()
|
||||||
|
|
||||||
|
|
||||||
def to_stacked_vtt(subtitles):
    """Render subtitles as WebVTT cues whose text accumulates ("stacks").

    Every cue shows the text of all earlier cues since the last section
    break plus its own text. Before a cue's text is added to a non-empty
    stack, a newline is inserted when that cue's stripped text ends with
    "." and a single space otherwise. Entries with a truthy "split" key
    reset the stack and produce no cue.

    Args:
        subtitles: Iterable of dicts with "content", "start" and "end" keys;
            section breaks additionally carry "split": True.

    Returns:
        The WebVTT text; every cue, including the last, is followed by two
        blank lines.
    """
    pieces = ["WEBVTT\n\n\n"]
    buffer = ""
    for subtitle in subtitles:
        if subtitle.get("split", False):
            buffer = ""
            continue
        text = subtitle["content"].strip()
        if buffer:
            # endswith() is safe for empty text, unlike indexing with [-1].
            buffer += "\n" if text.endswith(".") else " "
        buffer += text
        pieces.append(f"{subtitle['start']} --> {subtitle['end']}\n")
        pieces.append(buffer)
        pieces.append("\n\n\n")
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
def script_from_word_vtt(wordvtt):
    """Build the list of script sentences from a word-level VTT document.

    Each cue's text with the ``<u>``/``</u>`` markers removed is one
    sentence. A sentence is added when it differs from the one added last.
    It is also added when the previous cue ended with ``</u>`` and the cue
    after the current one (if any) carries the same text as that previous
    cue.

    Args:
        wordvtt: WebVTT text, parsed with ``from_vtt``.

    Returns:
        List of sentence strings in cue order.
    """

    def plain(text):
        # Cue text without the underline markers.
        return text.replace("<u>", "").replace("</u>", "")

    subtitles = from_vtt(wordvtt)
    print(f"VTT {len(subtitles)} lines. Generating script file from VTT.")

    sentences = []
    force_next = False
    for position, cue in enumerate(subtitles):
        sentence = plain(cue["content"])
        if force_next or not sentences or sentences[-1] != sentence:
            sentences.append(sentence)

        # Decide whether the following cue must be added even if its text
        # equals this one.
        force_next = cue["content"][-4:] == "</u>"
        if force_next and position + 2 < len(subtitles):
            if plain(subtitles[position + 2]["content"]) != sentence:
                force_next = False
    return sentences
|
||||||
|
|
||||||
|
|
||||||
def create_word_scenes(wordvtt, scriptraw):
    """Attach per-word timings from a word-level VTT to the script lines.

    Each non-empty script line with more than one space-separated word is a
    "scene". The cues are walked in order; a cue's text (without ``<u>``
    markers) must equal the current scene or the one right after it. The
    word inside ``<u>...</u>`` is taken as the word being timed and must be
    the next not-yet-timed word of the scene.

    Args:
        wordvtt: WebVTT text, parsed with ``from_vtt``.
        scriptraw: Script text, one sentence per line.

    Returns:
        ``(full_script, full_scenes)`` on success: the flat list of words
        (the last word of every scene carries a "##" suffix) and the
        matching list of dicts with "start", "end", "index" and "word".
        ``None`` after printing an error when cues and script do not line up.
    """
    subtitles = from_vtt(wordvtt)
    scripts = [i for i in scriptraw.split("\n") if i]
    print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")

    # Single-word lines are ignored on the script side and on the cue side.
    scenes = [
        {"scene": script, "timestamp": []}
        for script in scripts
        if len(script.split(" ")) != 1
    ]

    scenes_cur = 0
    for subtitle in subtitles:
        sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
        words = sentence.split(" ")
        if len(words) == 1:
            continue

        # Look up the current and next scene text without running past the
        # end of the list; a missing scene is reported as a mismatch.
        current_text = scenes[scenes_cur]["scene"] if scenes_cur < len(scenes) else None
        next_text = (
            scenes[scenes_cur + 1]["scene"] if scenes_cur + 1 < len(scenes) else None
        )
        if sentence != current_text:
            if sentence == next_text:
                scenes_cur += 1
            else:
                print(
                    f'Error, Mismatch in scenes\n=>"[{scenes_cur}] {current_text}"'
                    f' or "[{scenes_cur + 1}] {next_text}" != "{sentence}"'
                )
                return

        current_scene = scenes[scenes_cur]
        if current_scene["timestamp"]:
            word_idx = current_scene["timestamp"][-1]["index"] + 1
        else:
            word_idx = 0

        highlighted = "<u>" in subtitle["content"]
        if highlighted and word_idx >= len(words):
            # Every word of this scene is already timed but another timed
            # word arrived: move on to the following scene.
            print(
                f"Error, index wrong. {scenes_cur}, word: {word_idx}, "
                f"total words: {len(words)}\n{subtitle}"
            )
            scenes_cur += 1
            if scenes_cur >= len(scenes):
                print("Error, no scene left for remaining subtitles")
                return
            current_scene = scenes[scenes_cur]
            if current_scene["timestamp"]:
                word_idx = current_scene["timestamp"][-1]["index"] + 1
            else:
                word_idx = 0
            print(f"Changed to {word_idx}, {scenes_cur}")

        if highlighted:
            word = subtitle["content"].split("<u>")[1].split("</u>")[0]

            if word not in words:
                print(f'Error, Mismatch\n=> "{word}" not in "{sentence}"')
                return

            # Explicit check instead of assert + bare except, so it still
            # runs under -O and does not swallow unrelated errors.
            if word_idx >= len(words) or words[word_idx] != word:
                print(f'Error, Mismatch\n=> "{word}" != [{word_idx}] of "{sentence}"')
                return

            current_scene["timestamp"].append(
                {
                    "start": subtitle["start"],
                    "end": subtitle["end"],
                    "index": word_idx,
                    "word": word,
                }
            )

    for scene in scenes:
        if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
            print("Error, Mismatch length")
            return
        if "" in scene["scene"].split(" "):
            print(repr(scene["scene"]))

    full_script, full_scenes = [], []
    for scene in scenes:
        scene_words = scene["scene"].split(" ")
        full_script += scene_words[:-1]
        # "##" marks the last word of a scene.
        full_script.append(scene_words[-1] + "##")
        full_scenes += scene["timestamp"]

    for i, j in zip(full_script, full_scenes):
        if i.replace("##", "") != j["word"]:
            print("Error, Mismatch")
            return

    assert len(full_scenes) == len(full_script)

    return full_script, full_scenes
|
|
||||||
|
|
||||||
# Detect long break or change in context, inserts section break into script.
def autobreak(lines, times):
    """Turn a flat word list into script text with automatic line breaks.

    The pause before each word is its start time minus the previous word's
    end time. The mean of all positive pauses is used as the threshold for
    inserting an extra newline before a word. After each word a newline is
    written when the word ends with "." or contains the "##" marker, and a
    space otherwise. The "##" marker itself is removed from the output.

    Args:
        lines: Words in order.
        times: Dicts with "start" and "end" timestamps, parallel to *lines*.
            Timestamps may be "MM:SS.mmm" or "HH:MM:SS.mmm".

    Returns:
        The generated script as a single string.
    """
    from datetime import timedelta

    def parsetime(time_str):
        # Accept timestamps with or without the hours field.
        fields = time_str.split(":")
        if len(fields) == 2:
            fields.insert(0, "0")
        hours, minutes, seconds = fields
        seconds, milliseconds = seconds.split(".")
        return timedelta(
            hours=int(hours),
            minutes=int(minutes),
            seconds=int(seconds),
            milliseconds=int(milliseconds),
        )

    zero = parsetime("0:0.0")

    # First pass: collect every positive pause between consecutive words.
    long_breaks = []
    tmark = zero
    for _, j in zip(lines, times):
        tdiff = parsetime(j["start"]) - tmark
        tmark = parsetime(j["end"])
        if tdiff > zero:
            long_breaks.append(tdiff)

    mean_break = zero
    for i in long_breaks:
        mean_break += i / len(long_breaks)
    print(mean_break)

    # Second pass: emit the words with separators.
    pieces = []
    tmark = zero
    tmp = " "  # previous word; starts as a blank so it never ends with "."
    for i, j in zip(lines, times):
        tdiff = parsetime(j["start"]) - tmark
        tmark = parsetime(j["end"])
        if tdiff > mean_break and not tmp.endswith("."):
            pieces.append("\n")
        if tdiff >= mean_break and tmp.endswith("."):
            pieces.append("\n")

        pieces.append(i.replace("##", ""))

        if i.endswith(".") or "##" in i:
            pieces.append("\n")
        else:
            pieces.append(" ")
        tmp = i

    return "".join(pieces)
|
|
||||||
|
|
||||||
def scene_from_new_script(raw_script, full_script, full_scenes):
    """Map the words of an edited script back onto the original timings.

    The edited script is split into words; each newline is attached to the
    word before it, so a word may end in "\\n" or "\\n\\n". The words are
    then matched in order against *full_script*. Two words match when they
    are equal after removing ".", ",", "?", newlines and "##" and ignoring
    case. Original words with no counterpart in the edited script are
    skipped, up to a limit of consecutive misses.

    Args:
        raw_script: Edited script text.
        full_script: Original word list (last word of a scene ends in "##").
        full_scenes: Dicts with "start" and "end", parallel to *full_script*.

    Returns:
        ``(new_script, new_timestamp)``: the edited words (newlines kept)
        and a parallel list of ``{"start", "end"}`` dicts. ``None`` after
        printing an error when the words cannot be matched.
    """
    tokens = [t for t in raw_script.replace("\n", " \n ").split(" ") if t]

    # Attach each newline to the preceding word. A newline with no word in
    # front of it (leading blank lines) has nothing to attach to and is
    # dropped.
    mod_script = []
    for token in tokens:
        if token == "\n":
            if mod_script:
                mod_script[-1] += "\n"
        else:
            mod_script.append(token)

    print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
    allowed_list = [".", "\n", "\n\n", ",", "?", "##"]

    def normalized(x):
        for i in allowed_list:
            x = x.replace(i, "")
        return x.upper()

    new_script, new_timestamp, orig_index, n = [], [], 0, 0
    fail = 0
    while n < len(mod_script):
        if orig_index >= len(full_script):
            # The edited script still has words but the original is used up.
            print("Error: Failed to match words,")
            return
        print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
        word = mod_script[n]
        if normalized(word) == normalized(full_script[orig_index].replace("##", "")):
            cur = full_scenes[orig_index]
            new_script.append(word.replace("##", ""))
            new_timestamp.append({"start": cur["start"], "end": cur["end"]})
            fail = 0
        else:
            if fail > 10:
                print("Error: Failed to match words,")
                return
            fail += 1
            # Stay on the same edited word and try the next original word.
            n -= 1
        n, orig_index = n + 1, orig_index + 1

    assert len(new_script) == len(new_timestamp)
    return new_script, new_timestamp
|
|
||||||
|
|
||||||
def build_new_subtitle(new_script, new_timestamp):
    """Group timed words into subtitle lines and sections.

    A word containing "\\n" ends the current subtitle line; a word
    containing "\\n\\n" additionally ends the current section. Each line
    runs from the start of its first word to the end of its last word.
    Words left over after the last newline form a final line.

    Args:
        new_script: Words, where line/section ends are marked by trailing
            newlines.
        new_timestamp: Dicts with "start" and "end", parallel to
            *new_script*.

    Returns:
        Flat list of subtitle dicts ("content", "start", "end"). Consecutive
        sections are separated by
        ``{"content": "Break", "start": None, "end": None, "split": True}``.
    """
    buffer, new_scenes, start, end = [], [], None, None
    current_scene = []

    for word, stamp in zip(new_script, new_timestamp):
        # The first word of a line fixes its start time, including when that
        # word is also the last word of the line.
        if start is None:
            start = stamp["start"]
        end = stamp["end"]

        if "\n" in word:
            buffer.append(word.replace("\n", ""))
            current_scene.append(
                {
                    "content": " ".join(buffer).replace("##", ""),
                    "start": start,
                    "end": end,
                }
            )
            buffer, start = [], None
            if "\n\n" in word:
                last_content = current_scene[-1]["content"]
                print(
                    f'Section break at line #{len(current_scene):<3}| "{last_content}"'
                )
                new_scenes.append(current_scene)
                current_scene = []
        else:
            buffer.append(word)

    # Flush words that were not terminated by a newline. They are already in
    # the buffer, so nothing is appended again here.
    if buffer:
        current_scene.append(
            {"content": " ".join(buffer).replace("##", ""), "start": start, "end": end}
        )

    # Only a non-empty section is added, so a script that ends with a
    # section break does not produce a trailing "Break" entry.
    if current_scene:
        new_scenes.append(current_scene)

    newsub = []
    for n, scene in enumerate(new_scenes):
        newsub += scene
        if n < len(new_scenes) - 1:
            newsub.append(
                {"content": "Break", "start": None, "end": None, "split": True}
            )

    return newsub
|
def saveFile(filename, data, override=False):
    """Write *data* to *filename* as UTF-8 text.

    Args:
        filename: Destination path.
        data: Text to write.
        override: When False (default) an existing file is left untouched.

    Returns:
        -1 when the file exists and *override* is False (a message is
        printed); None after a successful write.
    """
    if os.path.exists(filename) and not override:
        print(f"File {filename} already exists.")
        return -1
    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data)
|
|
||||||
|
|
||||||
def openFile(filename):
|
def openFile(filename):
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
data = f.read()
|
data = f.read()
|
||||||
if not data:
|
if not data:
|
||||||
return -1
|
return -1
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def main(vttfile, scriptfile):
|
def main(vttfile, scriptfile):
|
||||||
modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
|
modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
|
||||||
x = create_word_scenes(openFile(vttfile), openFile(scriptfile))
|
x = create_word_scenes(openFile(vttfile), openFile(scriptfile))
|
||||||
if not x:
|
|
||||||
sys.exit(-1)
|
|
||||||
full_script, full_scenes = x
|
|
||||||
|
|
||||||
if not os.path.exists(modfile):
|
|
||||||
genscript = autobreak(full_script,full_scenes)
|
|
||||||
saveFile(modfile, genscript)
|
|
||||||
print(f"Saved modification file as {modfile}. Modify it and return back.")
|
|
||||||
else:
|
|
||||||
x = scene_from_new_script(openFile(modfile), full_script, full_scenes)
|
|
||||||
if not x:
|
if not x:
|
||||||
sys.exit(-1)
|
sys.exit(-1)
|
||||||
a, b = x
|
full_script, full_scenes = x
|
||||||
|
|
||||||
final_vtt = build_new_subtitle(a, b)
|
if not os.path.exists(modfile):
|
||||||
jsonfile = ".".join(vttfile.split(".")[:-1]) + ".json"
|
genscript = autobreak(full_script, full_scenes)
|
||||||
saveFile(jsonfile, json.dumps(final_vtt, indent=2), True)
|
saveFile(modfile, genscript)
|
||||||
print(f"Saved JSON file as {jsonfile}. Fix it, and convert it to VTT.")
|
print(f"Saved modification file as {modfile}. Modify it and return back.")
|
||||||
|
else:
|
||||||
|
x = scene_from_new_script(openFile(modfile), full_script, full_scenes)
|
||||||
|
if not x:
|
||||||
|
sys.exit(-1)
|
||||||
|
a, b = x
|
||||||
|
|
||||||
if __name__=="__main__":
|
final_vtt = build_new_subtitle(a, b)
|
||||||
import sys
|
jsonfile = ".".join(vttfile.split(".")[:-1]) + ".json"
|
||||||
if len(sys.argv) not in (2, 3):
|
saveFile(jsonfile, json.dumps(final_vtt, indent=2), True)
|
||||||
print(f"Usage: {sys.argv[0].split("/")[-1]} [vtt file] (txt file)\n" \
|
print(f"Saved JSON file as {jsonfile}. Fix it, and convert it to VTT.")
|
||||||
f" {sys.argv[0].split("/")[-1]} [JSON file]\n" \
|
|
||||||
"** Only output from openai-whisper with '--word-timestamp true' is accepted.)\n" \
|
|
||||||
"** You have to run this for first time, and then fix .script file, and then re-run this script.\n" \
|
|
||||||
"** Adding newline/period/commas are onlt permitted. Fix else in JSON file.")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
vtt = sys.argv[1]
|
|
||||||
print(f"\n[{vtt}]")
|
if __name__ == "__main__":
|
||||||
if len(sys.argv) == 3:
|
import sys
|
||||||
script = sys.argv[2]
|
|
||||||
if (not os.path.exists(vtt)) or (not os.path.exists(script)):
|
if len(sys.argv) not in (2, 3):
|
||||||
print(f"Input file doesnt exists.")
|
print(
|
||||||
sys.exit(-1)
|
f"Usage: {sys.argv[0].split("/")[-1]} [vtt file] (txt file)\n"
|
||||||
main(vtt, script)
|
f" {sys.argv[0].split("/")[-1]} [JSON file]\n"
|
||||||
else:
|
"** Only output from openai-whisper with '--word-timestamp true' is accepted.)\n"
|
||||||
if ".json" in vtt:
|
"** You have to run this for first time, and then fix .script file, and then re-run this script.\n"
|
||||||
final_vtt = json.loads(openFile(vtt))
|
"** Adding newline/period/commas are onlt permitted. Fix else in JSON file."
|
||||||
orgf = ".".join(vtt.split(".")[:-1])
|
)
|
||||||
print(f"Saved VTT file as {orgf}.final.vtt.")
|
sys.exit()
|
||||||
saveFile(orgf + ".final.vtt", to_vtt(final_vtt), True)
|
|
||||||
saveFile(orgf + ".stacked.vtt", to_stacked_vtt(final_vtt), True)
|
vtt = sys.argv[1]
|
||||||
sys.exit(0)
|
print(f"\n[{vtt}]")
|
||||||
if (not os.path.exists(vtt)):
|
if len(sys.argv) == 3:
|
||||||
print(f"Input file doesnt exists.")
|
script = sys.argv[2]
|
||||||
sys.exit(-1)
|
if (not os.path.exists(vtt)) or (not os.path.exists(script)):
|
||||||
script = ".".join(vtt.split(".")[:-1]) + ".txt"
|
print(f"Input file doesnt exists.")
|
||||||
saveFile(script, "\n".join(script_from_word_vtt(openFile(vtt))))
|
sys.exit(-1)
|
||||||
main(vtt, script)
|
main(vtt, script)
|
||||||
|
else:
|
||||||
|
if ".json" in vtt:
|
||||||
|
final_vtt = json.loads(openFile(vtt))
|
||||||
|
orgf = ".".join(vtt.split(".")[:-1])
|
||||||
|
print(f"Saved VTT file as {orgf}.final.vtt.")
|
||||||
|
saveFile(orgf + ".final.vtt", to_vtt(final_vtt), True)
|
||||||
|
saveFile(orgf + ".stacked.vtt", to_stacked_vtt(final_vtt), True)
|
||||||
|
sys.exit(0)
|
||||||
|
if not os.path.exists(vtt):
|
||||||
|
print(f"Input file doesnt exists.")
|
||||||
|
sys.exit(-1)
|
||||||
|
script = ".".join(vtt.split(".")[:-1]) + ".txt"
|
||||||
|
saveFile(script, "\n".join(script_from_word_vtt(openFile(vtt))))
|
||||||
|
main(vtt, script)
|
||||||
|
|
Loading…
Reference in New Issue