Fix
This commit is contained in:
parent
040421dd08
commit
368c26d491
|
@ -0,0 +1 @@
|
|||
# Preview a video side by side with a rendered subtitle panel.
#   i = input video, j = subtitle file handed to ffmpeg's `subtitles` filter.
# Stage 1: generate a gray 508x1080 NTSC-rate colour source whose duration is
#          the container duration ffprobe reports for $i, draw the subtitles
#          from $j on it (Helvetica 9.5, no outline/shadow, Alignment=4), and
#          write it as a NUT stream at 2000k to stdout.
# Stage 2: read $i plus the piped panel, place the two videos side by side with
#          hstack (stops at the shorter input), merge $i's audio with an anull
#          pass-through copy of itself via amerge, and write NUT to stdout.
# Stage 3: play the combined stream with ffplay.
# NOTE(review): $i and $j are expanded unquoted, so paths containing spaces
# will break; hstack presumably needs $i to be 1080 pixels high - confirm.
i="input.mp4";j="test.stacked.vtt";ffmpeg -f lavfi -i color=c=gray:s=508x1080:r=ntsc:d=$(ffprobe -i $i -show_entries format=duration -v quiet -of csv="p=0") -vf "subtitles=$j:force_style='FontName=Helvetica,Alignment=4,Fontsize=9.5,Outline=0,Shadow=0,MarginH=2,MarginV=4,Spacing=0'" -b:v 2000k -f nut - | ffmpeg -i $i -i - -filter_complex "[0:v][1:v]hstack=inputs=2:shortest=1[v];[0:a]anull[a2];[0:a][a2]amerge[a]" -map "[v]" -map "[a]" -b:v 2000k -f nut - | ffplay -
|
238
snusub.py
238
snusub.py
|
@ -4,6 +4,8 @@ import re, json
|
|||
import os, sys
|
||||
from datetime import timedelta
|
||||
|
||||
###
|
||||
|
||||
def from_vtt(vtt_string):
|
||||
VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
|
||||
VTT_LINE_NUMBER_PATTERN = r"^\d+$"
|
||||
|
@ -24,6 +26,9 @@ def from_vtt(vtt_string):
|
|||
|
||||
start, end = match.groups()
|
||||
content = '\n'.join(lines[1:]) + "\n"
|
||||
# if start == end:
|
||||
# continue
|
||||
|
||||
subtitles.append({
|
||||
'start': start,
|
||||
'end': end,
|
||||
|
@ -39,6 +44,8 @@ def to_vtt(subtitles):
|
|||
if not subtitle.get("split", False):
|
||||
start = subtitle['start']
|
||||
end = subtitle['end']
|
||||
if not start or not end or start == end:
|
||||
raise Exception(f"VTT timestamp parse error from #{idx}.")
|
||||
vtt_content += f"{start} --> {end}\n{content}\n\n\n"
|
||||
else:
|
||||
vtt_content += f"NOTE {content}\n\n\n"
|
||||
|
@ -58,6 +65,7 @@ def to_stacked_vtt(subtitles, continous = True):
|
|||
buffer += "\n"
|
||||
else:
|
||||
buffer += " "
|
||||
|
||||
buffer += subtitle['content'].strip()
|
||||
|
||||
if n < len(subtitles) - 1:
|
||||
|
@ -65,6 +73,10 @@ def to_stacked_vtt(subtitles, continous = True):
|
|||
else:
|
||||
end_time = subtitle['end']
|
||||
|
||||
if not subtitle['start'] or not end_time:
|
||||
raise Exception(f"VTT timestamp parse error from #{idx}.")
|
||||
if subtitle['start'] == end_time:
|
||||
raise Exception(f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}")
|
||||
vtt_content += f"{subtitle['start']} --> {end_time}\n"
|
||||
vtt_content += buffer
|
||||
vtt_content += "\n\n\n"
|
||||
|
@ -73,16 +85,18 @@ def to_stacked_vtt(subtitles, continous = True):
|
|||
|
||||
return vtt_content
|
||||
|
||||
###
|
||||
|
||||
def script_from_word_vtt(wordvtt):
|
||||
subtitles = from_vtt(wordvtt)
|
||||
print(f"VTT {len(subtitles)} lines. Generating script file from VTT.")
|
||||
print(f"Generating script file from VTT...")
|
||||
sentences = []
|
||||
EXCEPTION_FLAG, ADD_NEXT_SENTENCE = "", 0
|
||||
ADD_NEXT_SENTENCE = 0
|
||||
for n, subtitle in enumerate(subtitles):
|
||||
sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
|
||||
if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE:
|
||||
sentences.append(sentence)
|
||||
ADD_NEXT_SENTENCE = 0
|
||||
ADD_NEXT_SENTENCE = 0
|
||||
if subtitle["content"][-4:] == "</u>":
|
||||
ADD_NEXT_SENTENCE = 1
|
||||
if n + 2 < len(subtitles):
|
||||
|
@ -90,10 +104,11 @@ def script_from_word_vtt(wordvtt):
|
|||
ADD_NEXT_SENTENCE = 0
|
||||
return sentences
|
||||
|
||||
def create_word_scenes(wordvtt, scriptraw):
|
||||
subtitles = from_vtt(wordvtt)
|
||||
scripts = [i for i in scriptraw.split("\n") if i]
|
||||
print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")
|
||||
def create_word_scenes(raw_vtt, raw_script):
|
||||
subtitles = from_vtt(raw_vtt)
|
||||
scripts = [i for i in raw_script.split("\n") if i]
|
||||
print(f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words...")
|
||||
|
||||
scenes = []
|
||||
for n, script in enumerate(scripts):
|
||||
if len(script.split(" ")) == 1:
|
||||
|
@ -110,8 +125,7 @@ def create_word_scenes(wordvtt, scriptraw):
|
|||
if sentence == scenes[scenes_cur+1].get("scene"):
|
||||
scenes_cur += 1
|
||||
else:
|
||||
print(f"Error, Mismatch in scenes\n=>\"[{scenes_cur}] {scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
|
||||
return
|
||||
raise Exception(f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
|
||||
|
||||
current_scene = scenes[scenes_cur]
|
||||
if current_scene["timestamp"]:
|
||||
|
@ -120,9 +134,12 @@ def create_word_scenes(wordvtt, scriptraw):
|
|||
word_idx = 0
|
||||
|
||||
if ("<u>" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
|
||||
# Ignore trailing dummy subtitle after last word indexed.
|
||||
pass
|
||||
|
||||
if ("<u>" in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
|
||||
print(f"Error, index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
|
||||
# If there is trailing non-dummy timestamped subtitle, Reset word_idx and step to next scene. (Repeating sentence doesnt increment cur.)
|
||||
print(f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
|
||||
word_idx = 0
|
||||
scenes_cur += 1
|
||||
current_scene = scenes[scenes_cur]
|
||||
|
@ -132,26 +149,25 @@ def create_word_scenes(wordvtt, scriptraw):
|
|||
word_idx = 0
|
||||
print(f"Changed to {word_idx}, {scenes_cur}")
|
||||
|
||||
# Start matching words.
|
||||
if "<u>" in subtitle["content"]:
|
||||
word = subtitle["content"].split("<u>")[1].split("</u>")[0]
|
||||
|
||||
if word not in sentence.split(" "):
|
||||
print(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
|
||||
raise Exception(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
|
||||
return
|
||||
|
||||
try:
|
||||
assert sentence.split(" ")[word_idx] == word
|
||||
except:
|
||||
print(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
|
||||
return
|
||||
raise Exception(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
|
||||
|
||||
word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word}
|
||||
current_scene["timestamp"].append(word_time)
|
||||
|
||||
for scene in scenes:
|
||||
if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
|
||||
print("Error, Mismatch length")
|
||||
return
|
||||
raise Exception("Error, Scene length and timestamp length doesnt match.")
|
||||
if "" in scene["scene"].split(" "):
|
||||
print(repr(scene["scene"]))
|
||||
|
||||
|
@ -163,13 +179,90 @@ def create_word_scenes(wordvtt, scriptraw):
|
|||
|
||||
for i, j in zip(full_script, full_scenes):
|
||||
if i.replace("##", "") != j["word"]:
|
||||
print("Error, Mismatch")
|
||||
raise Exception("Error, Mismatch")
|
||||
return
|
||||
|
||||
assert len(full_scenes) == len(full_script)
|
||||
|
||||
return full_script, full_scenes
|
||||
|
||||
def scene_from_new_script(raw_script, full_script, full_scenes):
    """Align an edited script with the original word list and copy timestamps.

    The edited script is split into words; every newline is attached to the
    word that precedes it (so ``"a\\n\\nb"`` becomes ``["a\\n\\n", "b"]``).
    Each edited word is then matched, in order, against ``full_script``.
    Original words that do not match are skipped, which lets the edited
    script drop words, but more than eleven consecutive skips abort.

    Matching ignores case and the characters ``.``, ``,``, ``?``, newlines
    and the ``##`` marker.

    Args:
        raw_script: Edited script text.
        full_script: Original words, index-aligned with ``full_scenes``.
        full_scenes: Dicts with at least ``"start"`` and ``"end"`` keys.

    Returns:
        ``(new_script, new_timestamp)``: the matched edited words (with
        ``##`` removed, newlines kept) and one ``{"start", "end"}`` dict
        per word.

    Raises:
        Exception: Too many consecutive original words failed to match.
        IndexError: The original word list ran out before every edited
            word was matched.
    """
    tokens = [t for t in raw_script.replace("\n", " \n ").split(" ") if t]

    # Glue each newline onto the word before it.  A newline with no
    # preceding word (script starts with a blank line) is dropped instead
    # of wrapping around onto the last word.
    mod_script = []
    for token in tokens:
        if token != "\n":
            mod_script.append(token)
        elif mod_script:
            mod_script[-1] += "\n"

    print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")

    ignored_marks = (".", "\n", ",", "?", "##")

    def normalized(text):
        for mark in ignored_marks:
            text = text.replace(mark, "")
        return text.upper()

    new_script, new_timestamp = [], []
    n, orig_index, fail = 0, 0, 0
    while n < len(mod_script):
        if orig_index >= len(full_script):
            raise IndexError(
                f"Error: Ran out of original words while matching {mod_script[n]!r}."
            )
        word = mod_script[n]
        original = full_script[orig_index]
        print(f"{repr(word):>20} ? {repr(original)}")

        if normalized(word) == normalized(original):
            cur = full_scenes[orig_index]
            new_script.append(word.replace("##", ""))
            new_timestamp.append({"start": cur["start"], "end": cur["end"]})
            fail = 0
            n += 1
        else:
            # Stay on the same edited word and try the next original word.
            if fail > 10:
                raise Exception("Error: Failed to match words,")
            fail += 1
        orig_index += 1

    return new_script, new_timestamp
|
||||
|
||||
def build_new_subtitle(new_script, new_timestamp):
    """Group timestamped words into subtitle lines and sections.

    A word containing a newline ends the current subtitle line; a word
    containing a blank line (``"\\n\\n"``) additionally ends the current
    section.  Words left over after the last newline form a final line.
    Sections are joined with a ``{"content": "Break", "split": True}``
    marker entry between them (never after the last one).

    Args:
        new_script: Words, with line/section ends encoded as trailing
            newlines.
        new_timestamp: One dict per word with ``"start"`` and ``"end"``.

    Returns:
        A flat list of ``{"content", "start", "end"}`` dicts plus the
        "Break" marker entries.
    """
    buffer, start, last_end = [], None, None
    new_scenes, current_scene = [], []

    for word, stamp in zip(new_script, new_timestamp):
        buffer.append(word.replace("\n", ""))
        if not start:
            start = stamp["start"]
        last_end = stamp["end"]

        if "\n" not in word:
            continue

        # End of a subtitle line: it spans from its first word's start to
        # this word's end.
        current_scene.append({
            "content": " ".join(buffer).replace("##", ""),
            "start": start,
            "end": stamp["end"],
        })
        buffer, start = [], None

        if "\n\n" in word:
            last_content = current_scene[-1]["content"]
            print(f'Section break at line #{len(current_scene):<3}| "{last_content}"')
            new_scenes.append(current_scene)
            current_scene = []

    # Flush words after the last newline.  They are already in the buffer,
    # so the last word must not be appended a second time.
    if buffer:
        current_scene.append({
            "content": " ".join(buffer).replace("##", ""),
            "start": start,
            "end": last_end,
        })

    # Only keep a non-empty trailing section; an empty one would produce a
    # dangling "Break" marker at the end of the output.
    if current_scene:
        new_scenes.append(current_scene)

    newsub = []
    for n, scene in enumerate(new_scenes):
        newsub += scene
        if n < len(new_scenes) - 1:
            newsub.append({"content": "Break", "start": None, "end": None, "split": True})

    return newsub
|
||||
|
||||
###
|
||||
|
||||
def autobreak(lines, times):
|
||||
from datetime import timedelta
|
||||
|
||||
|
@ -222,108 +315,39 @@ def autobreak(lines, times):
|
|||
|
||||
return script
|
||||
|
||||
def scene_from_new_script(raw_script, full_script, full_scenes):
|
||||
mod_script = raw_script.replace("\n", " \n ").split(" ")
|
||||
mod_script = [i for i in mod_script if i]
|
||||
n = 0
|
||||
while True:
|
||||
if mod_script[n] == "\n":
|
||||
mod_script[n-1] += "\n"
|
||||
del(mod_script[n])
|
||||
n -= 1
|
||||
n += 1
|
||||
if n == len(mod_script):
|
||||
break
|
||||
|
||||
print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
|
||||
allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
|
||||
|
||||
def normalized(x):
|
||||
for i in allowed_list:
|
||||
x = x.replace(i, "")
|
||||
return x.upper()
|
||||
|
||||
same = lambda a, b: normalized(a) == normalized(b)
|
||||
new_script, new_timestamp, orig_index, n = [], [], 0, 0
|
||||
fail = 0
|
||||
while n < len(mod_script):
|
||||
print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
|
||||
word = mod_script[n]
|
||||
if same(word, full_script[orig_index].replace("##", "")):
|
||||
cur = full_scenes[orig_index]
|
||||
new_script.append(word.replace("##", ""))
|
||||
new_timestamp.append({"start": cur["start"], "end": cur["end"]})
|
||||
fail = 0
|
||||
else:
|
||||
if fail > 10:
|
||||
print("Error: Failed to match words,")
|
||||
return
|
||||
fail += 1
|
||||
n -= 1
|
||||
n, orig_index = n+1, orig_index+1
|
||||
assert len(new_script) == len(new_timestamp)
|
||||
return new_script, new_timestamp
|
||||
|
||||
def build_new_subtitle(new_script, new_timestamp):
|
||||
buffer, new_scenes, start, end = [], [], None, None
|
||||
current_scene = []
|
||||
|
||||
for i, j in zip(new_script, new_timestamp):
|
||||
if "\n" in i:
|
||||
buffer.append(i.replace("\n", ""))
|
||||
current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]})
|
||||
buffer, start = [], None
|
||||
|
||||
if "\n\n" in i:
|
||||
print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"")
|
||||
new_scenes.append(current_scene)
|
||||
current_scene = []
|
||||
|
||||
else:
|
||||
buffer.append(i)
|
||||
if not start:
|
||||
start = j["start"]
|
||||
|
||||
if start:
|
||||
buffer.append(i.replace("\n", ""))
|
||||
current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]})
|
||||
|
||||
if current_scene != (new_scenes[-1] if new_scenes else None):
|
||||
new_scenes.append(current_scene)
|
||||
|
||||
newsub = []
|
||||
for n, i in enumerate(new_scenes):
|
||||
newsub += i
|
||||
if n < len(new_scenes) - 1:
|
||||
newsub.append({"content": "Break", "start": None, "end": None, "split": True})
|
||||
|
||||
return newsub
|
||||
|
||||
###
|
||||
############################################
|
||||
|
||||
def saveFile(filename, data, override = False):
    """Write ``data`` to ``filename`` as text.

    Args:
        filename: Path of the file to write.
        data: Text to write.
        override: When false (the default) an existing file is never
            replaced.

    Raises:
        FileExistsError: ``filename`` already exists and ``override`` is
            false.
    """
    # "x" creates the file exclusively, so the existence check and the
    # creation happen in one step instead of check-then-open.
    try:
        with open(filename, "w" if override else "x") as f:
            f.write(data)
    except FileExistsError as err:
        raise FileExistsError(f"File {filename} already exists.") from err
|
||||
|
||||
def openFile(filename):
    """Return the text content of ``filename``.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a non-empty string.

    Raises:
        FileNotFoundError: ``filename`` does not exist.
        ValueError: The file exists but is empty.
    """
    try:
        with open(filename, "r") as f:
            data = f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File {filename} doesnt exists.") from err

    if not data:
        raise ValueError("Data empty.")
    return data
|
||||
|
||||
###
|
||||
############################################
|
||||
|
||||
if __name__=="__main__":
|
||||
PROG = sys.argv[0].split("/")[-1]
|
||||
if len(sys.argv) not in (3, 4):
|
||||
PROG = sys.argv[0].split("/")[-1]
|
||||
print(f"Usage: {PROG} script [VTT file] \n" \
|
||||
f" {" "*len(PROG)} apply [VTT file] [script file] \n" \
|
||||
f" {" "*len(PROG)} create [JSON file]" \
|
||||
)
|
||||
print( \
|
||||
f"""Usage: {PROG} [COMMAND] [FILES]...
|
||||
|
||||
Commands:
|
||||
- script <VTT file> Generates script file from vtt file.
|
||||
- apply <VTT file> <script file> Applies new scripted file to create JSON file.
|
||||
- create <JSON file> Creates new vtt from given JSON.
|
||||
""")
|
||||
sys.exit()
|
||||
|
||||
COMMAND = sys.argv[1]
|
||||
|
@ -331,7 +355,7 @@ if __name__=="__main__":
|
|||
print("Error. Command not found.")
|
||||
sys.exit()
|
||||
|
||||
print(f"-> {PROG} {COMMAND} {FILE}")
|
||||
print(f"-> {sys.argv}")
|
||||
if COMMAND == "script":
|
||||
FILE = sys.argv[2]
|
||||
if (not os.path.exists(FILE)):
|
||||
|
@ -339,12 +363,12 @@ if __name__=="__main__":
|
|||
sys.exit(-1)
|
||||
|
||||
modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
|
||||
x = create_word_scenes(openFile(FILE), script_from_word_vtt(openFile(FILE)))
|
||||
x = create_word_scenes(openFile(FILE), "\n".join(script_from_word_vtt(openFile(FILE))))
|
||||
if not x:
|
||||
sys.exit(-1)
|
||||
|
||||
full_script, full_scenes = x
|
||||
genscript = autobreak(full_script,full_scenes)
|
||||
genscript = autobreak(full_script, full_scenes)
|
||||
saveFile(modfile, genscript)
|
||||
print(f"Saved script file {modfile}.")
|
||||
|
||||
|
@ -358,7 +382,7 @@ if __name__=="__main__":
|
|||
print(f"Input file doesnt exists.")
|
||||
sys.exit(-1)
|
||||
|
||||
x = create_word_scenes(openFile(FILE1), script_from_word_vtt(openFile(FILE)))
|
||||
x = create_word_scenes(openFile(FILE1), "\n".join(script_from_word_vtt(openFile(FILE1))))
|
||||
if not x:
|
||||
sys.exit(-1)
|
||||
full_script, full_scenes = x
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
import json
|
||||
import os, sys
|
||||
|
||||
def readFile(file):
    """Return the text content of ``file``.

    Args:
        file: Path of the file to read.

    Returns:
        The whole file content as a string (may be empty).

    Raises:
        FileNotFoundError: ``file`` does not exist.
    """
    # Open directly instead of testing os.path.exists first, so the file
    # cannot disappear between the check and the open.
    try:
        with open(file, "r") as f:
            return f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File {file} doesn't exists.") from err
|
||||
|
||||
def writeFile(file, data, overwrite = False):
    """Write non-empty text ``data`` to ``file``.

    Args:
        file: Path of the file to write.
        data: Text to write; must not be empty.
        overwrite: When false (the default) an existing file is never
            replaced.

    Returns:
        The number of characters written, as returned by ``f.write``.

    Raises:
        FileExistsError: ``file`` already exists and ``overwrite`` is false.
        ValueError: ``data`` is empty.
    """
    if (not overwrite) and os.path.exists(file):
        raise FileExistsError(f"File {file} already exists.")
    if not data:
        raise ValueError("Tried to write empty data.")

    # "x" (exclusive creation) closes the gap between the existence check
    # above and the open: a file created in between is still not replaced.
    try:
        with open(file, "w" if overwrite else "x") as f:
            return f.write(data)
    except FileExistsError as err:
        raise FileExistsError(f"File {file} already exists.") from err
|
||||
|
||||
# Line that separates the editable content section of an .edit file from
# the read-only timestamp listing that follows it.
TIMESTAMP_HEADER = "############ TIMESTAMPS ############"


def render_edit(subtitles):
    """Render subtitle entries as the text of an ``.edit`` file.

    The first section holds one ``NNN | content`` line per subtitle, with
    an empty line for every entry flagged ``"split"``.  After the
    ``TIMESTAMP_HEADER`` line comes one ``NNN | start --> end`` line per
    subtitle.  ``NNN`` is a 1-based, zero-padded counter over the
    non-split entries.

    Args:
        subtitles: List of dicts with ``"content"``, ``"start"``, ``"end"``
            and an optional ``"split"`` flag.

    Returns:
        The ``.edit`` file text.
    """
    content_lines, timestamp_lines = [], []
    index = 0
    for subtitle in subtitles:
        if subtitle.get("split", False):
            content_lines.append("\n")
            continue
        index += 1
        content_lines.append(f"{index:03} | {subtitle['content'].strip()}\n")
        timestamp_lines.append(
            f"{index:03} | {subtitle['start']} --> {subtitle['end']} \n"
        )
    return (
        "".join(content_lines)
        + TIMESTAMP_HEADER + "\n\n"
        + "".join(timestamp_lines)
    )


def apply_edit(subtitles, edit_text):
    """Apply the content section of an ``.edit`` file to the subtitles.

    Only lines before ``TIMESTAMP_HEADER`` are read.  Each ``NNN | content``
    line replaces the content of the NNN-th non-split subtitle while
    keeping its timestamps; an empty line places a "Break" entry after the
    most recently read subtitle.  Changed contents are reported on stdout.

    Args:
        subtitles: The original entries (as loaded from the JSON file).
        edit_text: Text of the edited ``.edit`` file.

    Returns:
        A new list of subtitle dicts, ordered by index, with "Break"
        entries (``"split": True``) where the edit file has empty lines.
    """
    originals = dict(
        enumerate(s for s in subtitles if not s.get("split", False))
    )

    edited, breaks_after, last_idx = {}, set(), None
    for line in edit_text.split("\n"):
        # Lines produced by split("\n") never contain a newline, so the
        # header has to be compared without one; otherwise the timestamp
        # listing below it would be parsed as subtitle content.
        if line.strip() == TIMESTAMP_HEADER:
            break
        if not line:
            if last_idx is not None:
                breaks_after.add(last_idx)
            continue

        # maxsplit=1 keeps " | " inside the subtitle text intact.
        number, content = line.split(" | ", 1)
        last_idx = int(number) - 1
        original = originals[last_idx]
        # The edit file stores stripped content, so compare like with like.
        if original["content"].strip() != content:
            print(f"{last_idx} {original['content']} -> {content}")
        edited[last_idx] = {
            "content": content,
            "start": original["start"],
            "end": original["end"],
        }

    output = []
    for n in sorted(edited):
        output.append(edited[n])
        if n in breaks_after:
            output.append({"content": "Break", "start": None, "end": None, "split": True})
    return output


def main(argv):
    """Convert ``<name>.json`` to ``<name>.edit`` or apply an edit back.

    A path containing ``.json`` is rendered to ``<name>.edit``.  A path
    containing ``.edit`` is combined with the sibling ``<name>.json`` and
    written to ``<name>.json.1``.  Any other path is ignored.
    """
    if len(argv) < 2:
        print("Usage: <script> <file.json | file.edit>")
        sys.exit(1)

    file = argv[1]
    base = os.path.splitext(file)[0]

    if ".json" in file:
        subtitles = json.loads(readFile(file))
        writeFile(base + ".edit", render_edit(subtitles))
    elif ".edit" in file:
        subtitles = json.loads(readFile(base + ".json"))
        output = apply_edit(subtitles, readFile(file))
        writeFile(base + ".json.1", json.dumps(output, indent=2))


# Guarded so that importing this module does not run a conversion on the
# importer's command-line arguments.
if __name__ == "__main__":
    main(sys.argv)
|
Loading…
Reference in New Issue