Fix

2024-02-17 08:40:45 +09:00 · 2024-02-17 08:40:45 +09:00 · 826aca5c9f
parent 368c26d491
commit 826aca5c9f
6 changed files with 1306 additions and 1056 deletions
--- a/backup.py
+++ b/backup.py
@ -1,11 +1,12 @@
 import json
 import re
 def parse_vtt(vtt_filename):
-    with open(vtt_filename, 'r', encoding='utf-8') as file:
+    with open(vtt_filename, "r", encoding="utf-8") as file:
        lines = file.readlines()
-    time_pattern = re.compile(r'(\d+\.\d{3}) --> (\d+\.\d{3})')
+    time_pattern = re.compile(r"(\d+\.\d{3}) --> (\d+\.\d{3})")
    subtitles = []
    current_subtitle = {}
@ -13,43 +14,47 @@ def parse_vtt(vtt_filename):
    for line in lines[1:]:
        match = time_pattern.match(line)
        if match:
-            current_subtitle['start'] = float(match.group(1))
+            current_subtitle["start"] = float(match.group(1))
-            current_subtitle['end'] = float(match.group(2))
+            current_subtitle["end"] = float(match.group(2))
-            current_subtitle['content'] = ""
+            current_subtitle["content"] = ""
-        elif line.strip() == '':
+        elif line.strip() == "":
            if current_subtitle:
-                if current_subtitle['content'][-1] == "\n":
+                if current_subtitle["content"][-1] == "\n":
-                    current_subtitle['content'] = current_subtitle['content'][:-1]
+                    current_subtitle["content"] = current_subtitle["content"][:-1]
                subtitles.append(current_subtitle)
                current_subtitle = {}
        else:
-            current_subtitle['content'] += line.strip() + "\n"  # Space to separate lines
+            current_subtitle["content"] += (
                line.strip() + "\n"
            )  # Space to separate lines
    if current_subtitle:
-        if current_subtitle['content'][-1] == "\n":
+        if current_subtitle["content"][-1] == "\n":
-            current_subtitle['content'] = current_subtitle['content'][:-1]
+            current_subtitle["content"] = current_subtitle["content"][:-1]
        subtitles.append(current_subtitle)
    return subtitles
 def subtitles_to_backup(subtitles):
 def subtitles_to_backup(subtitles):
    backup_data = {
-        "subtitles": subtitles, 
+        "subtitles": subtitles,
        "script_lines": [],
        "line_index": len(subtitles),
-        "current_subtitle": {}, 
+        "current_subtitle": {},
-        "play": 0
+        "play": 0,
    }
    return backup_data
 def main(vtt_filename, output_filename):
    subtitles = parse_vtt(vtt_filename)
    backup_data = subtitles_to_backup(subtitles)
-    with open(output_filename, 'w', encoding='utf-8') as json_file:
+    with open(output_filename, "w", encoding="utf-8") as json_file:
        json.dump(backup_data, json_file, indent=2)
-vtt_filename = 'audio.vtt'
+
-output_filename = 'backup2.json'
+vtt_filename = "audio.vtt"
 output_filename = "backup2.json"
 main(vtt_filename, output_filename)
--- a/snusub.py
+++ b/snusub.py
@ -6,410 +6,479 @@ from datetime import timedelta
 ###
 def from_vtt(vtt_string):
-  VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
+    VTT_TIMECODE_PATTERN = (
-  VTT_LINE_NUMBER_PATTERN = r"^\d+$"
+        r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
-  parts = re.split(r'\n\n+', vtt_string.strip())
+    )
-  if parts[0].startswith('WEBVTT'):
+    VTT_LINE_NUMBER_PATTERN = r"^\d+$"
-    parts.pop(0)
+    parts = re.split(r"\n\n+", vtt_string.strip())
    if parts[0].startswith("WEBVTT"):
        parts.pop(0)
-  subtitles = []
+    subtitles = []
-  for part in parts:
+    for part in parts:
-    lines = part.split('\n')
+        lines = part.split("\n")
-    match = re.match(VTT_TIMECODE_PATTERN, lines[0])
+        match = re.match(VTT_TIMECODE_PATTERN, lines[0])
-    if not match:
+        if not match:
-      if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
+            if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
-        lines.pop(0)
+                lines.pop(0)
-      match = re.match(VTT_TIMECODE_PATTERN, lines[0])
+            match = re.match(VTT_TIMECODE_PATTERN, lines[0])
-    if not match:
+        if not match:
-      continue
+            continue
-    start, end = match.groups()
+        start, end = match.groups()
-    content = '\n'.join(lines[1:]) + "\n"
+        content = "\n".join(lines[1:]) + "\n"
-    # if start == end:
+        # if start == end:
-    #   continue
+        #   continue
-      
+
-    subtitles.append({
+        subtitles.append(
-      'start': start,
+            {
-      'end': end,
+                "start": start,
-      'content': (content.replace("-\n", "\n").replace("</u>-\n", "</u>\n").replace("-", " ").replace("%", " ").replace("<u> "," <u>").replace(" </u>","</u> ").replace("<u> </u>","").replace("<u></u>","").replace(" \n", "\n"))[:-1]
+                "end": end,
-    })
+                "content": (
                    content.replace("-\n", "\n")
                    .replace("</u>-\n", "</u>\n")
                    .replace("-", " ")
                    .replace("%", " ")
                    .replace("<u> ", " <u>")
                    .replace(" </u>", "</u> ")
                    .replace("<u> </u>", "")
                    .replace("<u></u>", "")
                    .replace(" \n", "\n")
                )[:-1],
            }
        )
    return subtitles
  return subtitles
 def to_vtt(subtitles):
    vtt_content = "WEBVTT\n\n\n"
    for idx, subtitle in enumerate(subtitles):
-        content = subtitle['content']
+        content = subtitle["content"]
        if not subtitle.get("split", False):
-          start = subtitle['start']
+            start = subtitle["start"]
-          end = subtitle['end']
+            end = subtitle["end"]
-          if not start or not end or start == end:
+            if not start or not end or start == end:
-            raise Exception(f"VTT timestamp parse error from #{idx}.")
+                raise Exception(f"VTT timestamp parse error from #{idx}.")
-          vtt_content += f"{start} --> {end}\n{content}\n\n\n"
+            vtt_content += f"{start} --> {end}\n{content}\n\n\n"
        else:
-          vtt_content += f"NOTE {content}\n\n\n"
+            vtt_content += f"NOTE {content}\n\n\n"
    return vtt_content.strip()
 def to_stacked_vtt(subtitles, continous = True):
  vtt_content = "WEBVTT\n\n\n"
  buffer = ""
  for n, subtitle in enumerate(subtitles):
    if subtitle.get("split", False):
      buffer = ""
      continue
-    if len(buffer) != 0:
+def to_stacked_vtt(subtitles, continous=True):
-      if str(subtitle['content'].strip())[-1] == ".":
+    vtt_content = "WEBVTT\n\n\n"
-        buffer += "\n"
+    buffer = ""
-      else:
+    for n, subtitle in enumerate(subtitles):
-        buffer += " "
+        if subtitle.get("split", False):
            buffer = ""
            continue
-    buffer += subtitle['content'].strip()
+        if len(buffer) != 0:
            if str(subtitle["content"].strip())[-1] == ".":
                buffer += "\n"
            else:
                buffer += " "
-    if n < len(subtitles) - 1:
+        buffer += subtitle["content"].strip()
      end_time = subtitles[n+1]['start'] if continous and not subtitles[n+1].get("split", False) else subtitle['end']
    else:
      end_time = subtitle['end']
    if not subtitle['start'] or not end_time:
      raise Exception(f"VTT timestamp parse error from #{idx}.")
    if subtitle['start'] == end_time:
      raise Exception(f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}")
    vtt_content += f"{subtitle['start']} --> {end_time}\n"
    vtt_content += buffer
    vtt_content += "\n\n\n"
-    print(f"{subtitle['start']} --> {end_time}\n{buffer}\n\n")
+        if n < len(subtitles) - 1:
            end_time = (
                subtitles[n + 1]["start"]
                if continous and not subtitles[n + 1].get("split", False)
                else subtitle["end"]
            )
        else:
            end_time = subtitle["end"]
        if not subtitle["start"] or not end_time:
            raise Exception(f"VTT timestamp parse error from #{idx}.")
        if subtitle["start"] == end_time:
            raise Exception(
                f"Error, subtitle timestamp overlaps.\n{subtitle['start']} --> {end_time} {subtitle['content'].strip()}"
            )
        vtt_content += f"{subtitle['start']} --> {end_time}\n"
        vtt_content += buffer
        vtt_content += "\n\n\n"
        print(f"{subtitle['start']} --> {end_time}\n{buffer}\n\n")
    return vtt_content
  return vtt_content
 ###
 def script_from_word_vtt(wordvtt):
-  subtitles = from_vtt(wordvtt)
+    subtitles = from_vtt(wordvtt)
-  print(f"Generating script file from VTT...")
+    print(f"Generating script file from VTT...")
-  sentences = []
+    sentences = []
-  ADD_NEXT_SENTENCE = 0
+    ADD_NEXT_SENTENCE = 0
-  for n, subtitle in enumerate(subtitles):
+    for n, subtitle in enumerate(subtitles):
-    sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
+        sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
-    if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE:
+        if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE:
-      sentences.append(sentence)
+            sentences.append(sentence)
-      ADD_NEXT_SENTENCE = 0
+            ADD_NEXT_SENTENCE = 0
-    if subtitle["content"][-4:] == "</u>":
+        if subtitle["content"][-4:] == "</u>":
-      ADD_NEXT_SENTENCE = 1
+            ADD_NEXT_SENTENCE = 1
-      if n + 2 < len(subtitles):
+            if n + 2 < len(subtitles):
-        if subtitles[n+2]["content"].replace("<u>", "").replace("</u>", "") != sentence:
+                if (
-          ADD_NEXT_SENTENCE = 0
+                    subtitles[n + 2]["content"].replace("<u>", "").replace("</u>", "")
-  return sentences
+                    != sentence
                ):
                    ADD_NEXT_SENTENCE = 0
    return sentences
 def create_word_scenes(raw_vtt, raw_script):
-  subtitles = from_vtt(raw_vtt)
+    subtitles = from_vtt(raw_vtt)
-  scripts   = [i for i in raw_script.split("\n") if i]
+    scripts = [i for i in raw_script.split("\n") if i]
-  print(f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words...")
+    print(
        f"Found {len(subtitles)} subtitles, {len(scripts)} scenes.\nTimestamping each words..."
    )
-  scenes = []
+    scenes = []
-  for n, script in enumerate(scripts):
+    for n, script in enumerate(scripts):
-    if len(script.split(" ")) == 1:
+        if len(script.split(" ")) == 1:
-      continue
+            continue
-    scenes.append({"scene": script, "timestamp": []})
+        scenes.append({"scene": script, "timestamp": []})
-  scenes_cur = 0
+    scenes_cur = 0
-  for n, subtitle in enumerate(subtitles):
+    for n, subtitle in enumerate(subtitles):
-    sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
+        sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
-    if len(sentence.split(" ")) == 1:
+        if len(sentence.split(" ")) == 1:
-      continue
+            continue
-    if sentence != scenes[scenes_cur].get("scene"):
+        if sentence != scenes[scenes_cur].get("scene"):
-      if sentence == scenes[scenes_cur+1].get("scene"):
+            if sentence == scenes[scenes_cur + 1].get("scene"):
-        scenes_cur += 1
+                scenes_cur += 1
-      else:
+            else:
-        raise Exception(f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
+                raise Exception(
                    f"Error, Failed to match sentence with scene.\n\"{scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\""
                )
-    current_scene = scenes[scenes_cur]
+        current_scene = scenes[scenes_cur]
-    if current_scene["timestamp"]:
+        if current_scene["timestamp"]:
-      word_idx = current_scene["timestamp"][-1]["index"] + 1
+            word_idx = current_scene["timestamp"][-1]["index"] + 1
-    else:
+        else:
-      word_idx = 0
+            word_idx = 0
-    if ("<u>" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
+        if ("<u>" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
-      # Ignore trailing dummy subtitle after last word indexed.
+            # Ignore trailing dummy subtitle after last word indexed.
-      pass
+            pass
-    if ("<u>" in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
+        if ("<u>" in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
-      # If there is trailing non-dummy timestamped subtitle, Reset word_idx and step to next scene. (Repeating sentence doesnt increment cur.)
+            # If there is trailing non-dummy timestamped subtitle, Reset word_idx and step to next scene. (Repeating sentence doesnt increment cur.)
-      print(f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
+            print(
-      word_idx = 0
+                f"Error, Index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}"
-      scenes_cur += 1
+            )
-      current_scene = scenes[scenes_cur]
+            word_idx = 0
-      if current_scene["timestamp"]:
+            scenes_cur += 1
-        word_idx = current_scene["timestamp"][-1]["index"] + 1
+            current_scene = scenes[scenes_cur]
-      else:
+            if current_scene["timestamp"]:
-        word_idx = 0
+                word_idx = current_scene["timestamp"][-1]["index"] + 1
-      print(f"Changed to {word_idx}, {scenes_cur}")
+            else:
                word_idx = 0
            print(f"Changed to {word_idx}, {scenes_cur}")
-    # Start matching words.
+        # Start matching words.
-    if "<u>" in subtitle["content"]:
+        if "<u>" in subtitle["content"]:
-      word = subtitle["content"].split("<u>")[1].split("</u>")[0]
+            word = subtitle["content"].split("<u>")[1].split("</u>")[0]
-      if word not in sentence.split(" "):
+            if word not in sentence.split(" "):
-        raise Exception(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
+                raise Exception(f'Error, Mismatch\n=> "{word}" not in "{sentence}"')
-        return
+                return
-      try:
+            try:
-        assert sentence.split(" ")[word_idx] == word
+                assert sentence.split(" ")[word_idx] == word
-      except:
+            except:
-        raise Exception(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
+                raise Exception(
                    f'Error, Mismatch\n=> "{word}" != [{word_idx}] of "{sentence}"'
                )
-      word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word}
+            word_time = {
-      current_scene["timestamp"].append(word_time)
+                "start": subtitle["start"],
                "end": subtitle["end"],
                "index": word_idx,
                "word": word,
            }
            current_scene["timestamp"].append(word_time)
-  for scene in scenes:
+    for scene in scenes:
-    if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
+        if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
-      raise Exception("Error, Scene length and timestamp length doesnt match.")
+            raise Exception("Error, Scene length and timestamp length doesnt match.")
-    if "" in scene["scene"].split(" "):
+        if "" in scene["scene"].split(" "):
-      print(repr(scene["scene"]))
+            print(repr(scene["scene"]))
-  full_script, full_scenes = [], []
+    full_script, full_scenes = [], []
-  for scene in scenes:
+    for scene in scenes:
-    full_script += scene["scene"].split(" ")[:-1]
+        full_script += scene["scene"].split(" ")[:-1]
-    full_script.append(scene["scene"].split(" ")[-1]+"##")
+        full_script.append(scene["scene"].split(" ")[-1] + "##")
-    full_scenes += scene["timestamp"]
+        full_scenes += scene["timestamp"]
-  for i, j in zip(full_script, full_scenes):
+    for i, j in zip(full_script, full_scenes):
-    if i.replace("##", "") != j["word"]:
+        if i.replace("##", "") != j["word"]:
-      raise Exception("Error, Mismatch")
+            raise Exception("Error, Mismatch")
-      return
+            return
    assert len(full_scenes) == len(full_script)
    return full_script, full_scenes
  assert len(full_scenes) == len(full_script)
  return full_script, full_scenes
 def scene_from_new_script(raw_script, full_script, full_scenes):
-  mod_script = raw_script.replace("\n", " \n ").split(" ")
+    mod_script = raw_script.replace("\n", " \n ").split(" ")
-  mod_script = [i for i in mod_script if i]
+    mod_script = [i for i in mod_script if i]
-  n = 0
+    n = 0
-  while True:
+    while True:
-    if mod_script[n] == "\n":
+        if mod_script[n] == "\n":
-      mod_script[n-1] += "\n"
+            mod_script[n - 1] += "\n"
-      del(mod_script[n])
+            del mod_script[n]
-      n -= 1
+            n -= 1
-    n += 1
+        n += 1
-    if n == len(mod_script):
+        if n == len(mod_script):
-      break
+            break
-  
+
-  print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
+    print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
-  allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
+    allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
    def normalized(x):
        for i in allowed_list:
            x = x.replace(i, "")
        return x.upper()
    same = lambda a, b: normalized(a) == normalized(b)
    new_script, new_timestamp, orig_index, n = [], [], 0, 0
    fail = 0
    while n < len(mod_script):
        print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
        word = mod_script[n]
        if same(word, full_script[orig_index].replace("##", "")):
            cur = full_scenes[orig_index]
            new_script.append(word.replace("##", ""))
            new_timestamp.append({"start": cur["start"], "end": cur["end"]})
            fail = 0
        else:
            if fail > 10:
                raise Exception("Error: Failed to match words,")
                return
            fail += 1
            n -= 1
        n, orig_index = n + 1, orig_index + 1
    assert len(new_script) == len(new_timestamp)
    return new_script, new_timestamp
  def normalized(x):
    for i in allowed_list:
      x = x.replace(i, "")
    return x.upper()
  same = lambda a, b: normalized(a) == normalized(b)
  new_script, new_timestamp, orig_index, n = [], [], 0, 0
  fail = 0
  while n < len(mod_script):
    print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
    word = mod_script[n]
    if same(word, full_script[orig_index].replace("##", "")):
      cur = full_scenes[orig_index]
      new_script.append(word.replace("##", ""))
      new_timestamp.append({"start": cur["start"], "end": cur["end"]})
      fail = 0
    else:
      if fail > 10:
        raise Exception("Error: Failed to match words,")
        return
      fail += 1
      n -= 1
    n, orig_index = n+1, orig_index+1
  assert len(new_script) == len(new_timestamp)
  return new_script, new_timestamp
 def build_new_subtitle(new_script, new_timestamp):
-  buffer, new_scenes, start, end = [], [], None, None
+    buffer, new_scenes, start, end = [], [], None, None
-  current_scene = []
+    current_scene = []
  for i, j in zip(new_script, new_timestamp):
    buffer.append(i.replace("\n", ""))
    if not start:
      start = j["start"]
-    if "\n" in i:
+    for i, j in zip(new_script, new_timestamp):
-      current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]})
+        buffer.append(i.replace("\n", ""))
-      buffer, start = [], None
+        if not start:
- 
+            start = j["start"]
    if "\n\n" in i:
      print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"")
      new_scenes.append(current_scene)
      current_scene = []
-  if start:
+        if "\n" in i:
-      buffer.append(i.replace("\n", ""))
+            current_scene.append(
-      current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]})
+                {
                    "content": " ".join(buffer).replace("##", ""),
                    "start": start,
                    "end": j["end"],
                }
            )
            buffer, start = [], None
-  if current_scene != (new_scenes[-1] if new_scenes else None):
+        if "\n\n" in i:
-    new_scenes.append(current_scene)
+            print(
                f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\""
            )
            new_scenes.append(current_scene)
            current_scene = []
-  newsub = []
+    if start:
-  for n, i in enumerate(new_scenes):
+        buffer.append(i.replace("\n", ""))
-    newsub += i
+        current_scene.append(
-    if n < len(new_scenes) - 1:
+            {"content": " ".join(buffer), "start": start, "end": j["end"]}
-      newsub.append({"content": "Break", "start": None, "end": None, "split": True})
+        )
    if current_scene != (new_scenes[-1] if new_scenes else None):
        new_scenes.append(current_scene)
    newsub = []
    for n, i in enumerate(new_scenes):
        newsub += i
        if n < len(new_scenes) - 1:
            newsub.append(
                {"content": "Break", "start": None, "end": None, "split": True}
            )
    return newsub
  return newsub
 ###
 def autobreak(lines, times):
-  from datetime import timedelta
+    from datetime import timedelta
-  def parsetime(time_str):
+    def parsetime(time_str):
-    minutes, seconds = time_str.split(':')
+        minutes, seconds = time_str.split(":")
-    seconds, milliseconds = seconds.split('.')
+        seconds, milliseconds = seconds.split(".")
-    td = timedelta(minutes=int(minutes), seconds=int(seconds), milliseconds=int(milliseconds))
+        td = timedelta(
-    return td
+            minutes=int(minutes), seconds=int(seconds), milliseconds=int(milliseconds)
        )
        return td
-  script = []
+    script = []
-  long_breaks = []
+    long_breaks = []
-  tmark = parsetime("0:0.0")
+    tmark = parsetime("0:0.0")
-  for i, j in zip(lines, times):
+    for i, j in zip(lines, times):
-    tdiff = parsetime(j["start"]) - tmark
+        tdiff = parsetime(j["start"]) - tmark
-    tmark = parsetime(j["end"])
+        tmark = parsetime(j["end"])
-    if tdiff > parsetime("0:0.0"):
+        if tdiff > parsetime("0:0.0"):
-      long_breaks.append(tdiff)
+            long_breaks.append(tdiff)
-  mean_break = parsetime("0:0.0")
+    mean_break = parsetime("0:0.0")
-  for i in long_breaks:
+    for i in long_breaks:
-    mean_break += i/len(long_breaks)
+        mean_break += i / len(long_breaks)
-  print(mean_break)
+    print(mean_break)
-  script = ""
+    script = ""
-  tmark = parsetime("0:0.0")
+    tmark = parsetime("0:0.0")
-  tmp = " "
+    tmp = " "
-  continous_line = 0
+    continous_line = 0
-  for i, j in zip(lines, times):
+    for i, j in zip(lines, times):
-    tdiff = parsetime(j["start"]) - tmark
+        tdiff = parsetime(j["start"]) - tmark
-    tmark = parsetime(j["end"])
+        tmark = parsetime(j["end"])
-    if tdiff > mean_break and tmp[-1] != ".":
+        if tdiff > mean_break and tmp[-1] != ".":
-      script += "\n"
+            script += "\n"
-    if (tdiff >= mean_break and tmp[-1] == "."):
+        if tdiff >= mean_break and tmp[-1] == ".":
-        script += "\n"
+            script += "\n"
-        continous_line = 0
+            continous_line = 0
-    else:
+        else:
-      continous_line += 1
+            continous_line += 1
-    script += i.replace("##", "")
+        script += i.replace("##", "")
        if i[-1] == ".":
            script += "\n"
        elif "##" in i:
            script += "\n"
        else:
            script += " "
        tmp = i
    return script
    if i[-1] == ".":
      script += "\n"
    elif "##" in i:
        script += "\n"
    else:
      script += " " 
    tmp = i
  return script
 ############################################
-def saveFile(filename, data, override = False):
+
-  if os.path.exists(filename) and not override:
+def saveFile(filename, data, override=False):
-    raise Exception(f"File {filename} already exists.")
+    if os.path.exists(filename) and not override:
-    return
+        raise Exception(f"File {filename} already exists.")
-  with open(filename, "w") as f:
+        return
-    f.write(data)
+    with open(filename, "w") as f:
        f.write(data)
 def openFile(filename):
-  if not os.path.exists(filename):
+    if not os.path.exists(filename):
-    raise Exception(f"File {filename} doesnt exists.")
+        raise Exception(f"File {filename} doesnt exists.")
-    return
+        return
-  with open(filename, "r") as f:
+    with open(filename, "r") as f:
-    data = f.read()
+        data = f.read()
-  if not data:
+    if not data:
-    raise Exception("Data empty.")
+        raise Exception("Data empty.")
-    return
+        return
-  return data
+    return data
 ############################################
-if __name__=="__main__":
+if __name__ == "__main__":
-  PROG = sys.argv[0].split("/")[-1]
+    PROG = sys.argv[0].split("/")[-1]
-  if len(sys.argv) not in (3, 4):
+    if len(sys.argv) not in (3, 4):
-    print( \
+        print(
-f"""Usage: {PROG} [COMMAND] [FILES]...
+            f"""Usage: {PROG} [COMMAND] [FILES]...
 Commands:
 - script   <VTT file>                    Generates script file from vtt file.
 - apply    <VTT file> <script file>      Applies new scripted file to create JSON file.
 - create   <JSON file>                   Creates new vtt from given JSON.
- """)                              
+ """
-    sys.exit()
+        )
        sys.exit()
-  COMMAND = sys.argv[1]
+    COMMAND = sys.argv[1]
-  if COMMAND not in ["script", "apply", "create"]:
+    if COMMAND not in ["script", "apply", "create"]:
-    print("Error. Command not found.")
+        print("Error. Command not found.")
-    sys.exit()
+        sys.exit()
-  print(f"-> {sys.argv}")
+    print(f"-> {sys.argv}")
-  if COMMAND == "script":
+    if COMMAND == "script":
-    FILE = sys.argv[2]
+        FILE = sys.argv[2]
-    if (not os.path.exists(FILE)):
+        if not os.path.exists(FILE):
-      print(f"Input file doesnt exists.")
+            print(f"Input file doesnt exists.")
-      sys.exit(-1)
+            sys.exit(-1)
-    modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
+        modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
-    x = create_word_scenes(openFile(FILE), "\n".join(script_from_word_vtt(openFile(FILE))))
+        x = create_word_scenes(
-    if not x:
+            openFile(FILE), "\n".join(script_from_word_vtt(openFile(FILE)))
-      sys.exit(-1)
+        )
        if not x:
            sys.exit(-1)
-    full_script, full_scenes = x
+        full_script, full_scenes = x
-    genscript = autobreak(full_script, full_scenes)
+        genscript = autobreak(full_script, full_scenes)
-    saveFile(modfile, genscript)
+        saveFile(modfile, genscript)
-    print(f"Saved script file {modfile}.")
+        print(f"Saved script file {modfile}.")
  elif COMMAND == "apply":
    if len(sys.argv) != 4:
      print(f"Not sufficient input.")
      sys.exit()
-    FILE1, FILE2 = sys.argv[2], sys.argv[3]
+    elif COMMAND == "apply":
-    if (not os.path.exists(FILE1)) or (not os.path.exists(FILE2)):
+        if len(sys.argv) != 4:
-      print(f"Input file doesnt exists.")
+            print(f"Not sufficient input.")
-      sys.exit(-1)
+            sys.exit()
-    x = create_word_scenes(openFile(FILE1), "\n".join(script_from_word_vtt(openFile(FILE1))))
+        FILE1, FILE2 = sys.argv[2], sys.argv[3]
-    if not x:
+        if (not os.path.exists(FILE1)) or (not os.path.exists(FILE2)):
-      sys.exit(-1)
+            print(f"Input file doesnt exists.")
-    full_script, full_scenes = x
+            sys.exit(-1)
-    x = scene_from_new_script(openFile(FILE2), full_script, full_scenes)
+        x = create_word_scenes(
-    if not x:
+            openFile(FILE1), "\n".join(script_from_word_vtt(openFile(FILE1)))
-      sys.exit(-1)
+        )
-    a, b = x
+        if not x:
            sys.exit(-1)
        full_script, full_scenes = x
-    final_sub = build_new_subtitle(a, b)
+        x = scene_from_new_script(openFile(FILE2), full_script, full_scenes)
-    jsonfile = ".".join(FILE1.split(".")[:-1]) + ".json"
+        if not x:
-    saveFile(jsonfile, json.dumps(final_sub, indent=2), True)
+            sys.exit(-1)
-    print(f"Saved JSON file {jsonfile}.")
+        a, b = x
    sys.exit(0)
  elif COMMAND == "create":
    FILE = sys.argv[2]
    if (not os.path.exists(FILE)):
      print(f"Input file doesnt exists.")
      sys.exit(-1)
-    final_vtt = json.loads(openFile(FILE))
+        final_sub = build_new_subtitle(a, b)
-    orgf = ".".join(FILE.split(".")[:-1])
+        jsonfile = ".".join(FILE1.split(".")[:-1]) + ".json"
-    print(f"Saved VTT file as {orgf}.final.vtt.")
+        saveFile(jsonfile, json.dumps(final_sub, indent=2), True)
        print(f"Saved JSON file {jsonfile}.")
        sys.exit(0)
-    if os.path.exists(orgf + ".vtt"):
+    elif COMMAND == "create":
-      saveFile(orgf + ".stacked.vtt", to_stacked_vtt(final_vtt), True)
+        FILE = sys.argv[2]
-    else:
+        if not os.path.exists(FILE):
-      saveFile(orgf + ".vtt", to_stacked_vtt(final_vtt), True)
+            print(f"Input file doesnt exists.")
-    sys.exit(0)
+            sys.exit(-1)
        final_vtt = json.loads(openFile(FILE))
        orgf = ".".join(FILE.split(".")[:-1])
        print(f"Saved VTT file as {orgf}.final.vtt.")
        if os.path.exists(orgf + ".vtt"):
            saveFile(orgf + ".stacked.vtt", to_stacked_vtt(final_vtt), True)
        else:
            saveFile(orgf + ".vtt", to_stacked_vtt(final_vtt), True)
        sys.exit(0)
--- a/stackvtt.py
+++ b/stackvtt.py
@ -1,18 +1,21 @@
 import re
 from datetime import timedelta
-VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
+VTT_TIMECODE_PATTERN = (
    r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
 )
 VTT_LINE_NUMBER_PATTERN = r"^\d+$"
 def from_vtt(vtt_string):
    parts = re.split(r'\n\n+', vtt_string.strip())
-    if parts[0].startswith('WEBVTT'):
+def from_vtt(vtt_string):
    parts = re.split(r"\n\n+", vtt_string.strip())
    if parts[0].startswith("WEBVTT"):
        parts.pop(0)
    subtitles = []
    for part in parts:
-        lines = part.split('\n')
+        lines = part.split("\n")
        match = re.match(VTT_TIMECODE_PATTERN, lines[0])
        if not match:
            if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
@ -22,30 +25,28 @@ def from_vtt(vtt_string):
            continue
        start, end = match.groups()
-        content = '\n'.join(lines[1:])
+        content = "\n".join(lines[1:])
-        subtitles.append({
+        subtitles.append({"start": start, "end": end, "content": content})
            'start': start,
            'end': end,
            'content': content
        })
    return subtitles
 def to_vtt(subtitles):
    vtt_content = "WEBVTT\n\n"
    for idx, subtitle in enumerate(subtitles):
-        start = subtitle['start']
+        start = subtitle["start"]
-        end = subtitle['end']
+        end = subtitle["end"]
-        content = subtitle['content']
+        content = subtitle["content"]
        vtt_content += f"{start} --> {end}\n{content}\n\n"
    return vtt_content.strip()
 def stack_subtitle():
    buffer = []
    linebuf = []
    for line in parsed_vtt:
-        print(line["content"].strip()) 
+        print(line["content"].strip())
        content = line["content"].strip()
        if True:
            linebuf.append(line)
@ -62,12 +63,13 @@ def stack_subtitle():
            # if scene["content"][-1] == ".":
            strbuf += "\n"
            # else:
-                # strbuf += " "
+            # strbuf += " "
            scene["content"] = strbuf
            sub.append(scene)
 with open("example.vtt", "r") as f:
    vtt_content = f.read()
 parsed_vtt = from_vtt(vtt_content)
-print(to_vtt(stack_subtitle(parsed_vtt)))
+print(to_vtt(stack_subtitle(parsed_vtt)))
--- a/subedit.py
+++ b/subedit.py
@ -1,79 +1,88 @@
 import json
 import os, sys
 def readFile(file):
  if not os.path.exists(file):
    raise Exception(f"File {file} doesn't exists.")
  with open(file, "r") as f:
    data = f.read()
  return data
-def writeFile(file, data, overwrite = False):
+def readFile(file):
-  if (not overwrite) and os.path.exists(file):
+    if not os.path.exists(file):
-    raise Exception(f"File {file} already exists.")
+        raise Exception(f"File {file} doesn't exists.")
-  if not len(data):
+    with open(file, "r") as f:
-    raise Exception(f"Tried to write empty data.")
+        data = f.read()
-  with open(file, "w") as f:
+    return data
-    ret = f.write(data)
+
-  return ret
+
 def writeFile(file, data, overwrite=False):
    if (not overwrite) and os.path.exists(file):
        raise Exception(f"File {file} already exists.")
    if not len(data):
        raise Exception(f"Tried to write empty data.")
    with open(file, "w") as f:
        ret = f.write(data)
    return ret
 file = sys.argv[1]
 if ".json" in file:
-  subtitles = json.loads(readFile(file))
+    subtitles = json.loads(readFile(file))
-  output = ""
+    output = ""
-  index = 0
+    index = 0
-  for subtitle in subtitles:
+    for subtitle in subtitles:
-    if subtitle.get("split", False):
+        if subtitle.get("split", False):
-      output += "\n"
+            output += "\n"
-    else:
+        else:
-      index += 1
+            index += 1
-      start = subtitle["start"]
+            start = subtitle["start"]
-      end = subtitle["end"]
+            end = subtitle["end"]
-      content = subtitle["content"]
+            content = subtitle["content"]
-      "| {start:>10} --> {end:>10} |"
+            "| {start:>10} --> {end:>10} |"
-      output += f"{index:03} | {content.strip()}\n"
+            output += f"{index:03} | {content.strip()}\n"
-  output += "############ TIMESTAMPS ############\n\n"
+    output += "############ TIMESTAMPS ############\n\n"
-  index = 0
+    index = 0
-  for subtitle in subtitles:
+    for subtitle in subtitles:
-    if not subtitle.get("split", False):
+        if not subtitle.get("split", False):
-      index += 1
+            index += 1
-      start = subtitle["start"]
+            start = subtitle["start"]
-      end = subtitle["end"]
+            end = subtitle["end"]
-      output += f"{index:03} | {start} --> {end} \n"
+            output += f"{index:03} | {start} --> {end} \n"
-  writeFile(os.path.splitext(file)[0]+".edit", output)
+    writeFile(os.path.splitext(file)[0] + ".edit", output)
 elif ".edit" in file:
-  subtitles = json.loads(readFile(os.path.splitext(file)[0]+".json"))
+    subtitles = json.loads(readFile(os.path.splitext(file)[0] + ".json"))
-  lines = readFile(file)
+    lines = readFile(file)
-  idx, sub = 0, {}
+    idx, sub = 0, {}
-  for subtitle in subtitles:
+    for subtitle in subtitles:
-    if not subtitle.get("split", False):
+        if not subtitle.get("split", False):
-      sub[idx] = subtitle
+            sub[idx] = subtitle
-      idx += 1
+            idx += 1
-  new_brk, new_sub = [], {}
+    new_brk, new_sub = [], {}
-  for line in lines.split("\n"):
+    for line in lines.split("\n"):
-    if "\n############ TIMESTAMPS ############" == line:
+        if "\n############ TIMESTAMPS ############" == line:
-      break
+            break
-    if line:
+        if line:
-      idx, content = line.split(" | ")
+            idx, content = line.split(" | ")
-      idx = int(idx) - 1
+            idx = int(idx) - 1
-      if sub[idx]["content"] != content:
+            if sub[idx]["content"] != content:
-        print(f"{idx} {sub[idx]["content"]} -> {content}")
+                print(f"{idx} {sub[idx]["content"]} -> {content}")
-      new_sub[idx] = {"content": content, "start": sub[idx]["start"], "end": sub[idx]["end"]}
+            new_sub[idx] = {
-    else:
+                "content": content,
-      new_brk.append(idx)
+                "start": sub[idx]["start"],
                "end": sub[idx]["end"],
            }
        else:
            new_brk.append(idx)
-  output = []
+    output = []
-  for n in sorted(new_sub):
+    for n in sorted(new_sub):
-    subtitle = new_sub[n]
+        subtitle = new_sub[n]
-    output.append(subtitle)
+        output.append(subtitle)
-    if n in new_brk:
+        if n in new_brk:
-      output.append({"content": "Break", "start": None, "end": None, "split": True})
+            output.append(
                {"content": "Break", "start": None, "end": None, "split": True}
            )
-  writeFile(os.path.splitext(file)[0]+".json.1", json.dumps(output, indent=2))
+    writeFile(os.path.splitext(file)[0] + ".json.1", json.dumps(output, indent=2))
--- a/vttmaker.py
+++ b/vttmaker.py
--- a/wordvtt.py
+++ b/wordvtt.py
@ -4,50 +4,64 @@ import re, json
 import os
 from datetime import timedelta
 def from_vtt(vtt_string):
-  VTT_TIMECODE_PATTERN = r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
+    VTT_TIMECODE_PATTERN = (
-  VTT_LINE_NUMBER_PATTERN = r"^\d+$"
+        r"((?:\d{2}:)?\d{2}:\d{2}\.\d{3}) --> ((?:\d{2}:)?\d{2}:\d{2}\.\d{3})"
-  parts = re.split(r'\n\n+', vtt_string.strip())
+    )
-  if parts[0].startswith('WEBVTT'):
+    VTT_LINE_NUMBER_PATTERN = r"^\d+$"
-    parts.pop(0)
+    parts = re.split(r"\n\n+", vtt_string.strip())
    if parts[0].startswith("WEBVTT"):
        parts.pop(0)
-  subtitles = []
+    subtitles = []
-  for part in parts:
+    for part in parts:
-    lines = part.split('\n')
+        lines = part.split("\n")
-    match = re.match(VTT_TIMECODE_PATTERN, lines[0])
+        match = re.match(VTT_TIMECODE_PATTERN, lines[0])
-    if not match:
+        if not match:
-      if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
+            if re.match(VTT_LINE_NUMBER_PATTERN, lines[0]):
-        lines.pop(0)
+                lines.pop(0)
-      match = re.match(VTT_TIMECODE_PATTERN, lines[0])
+            match = re.match(VTT_TIMECODE_PATTERN, lines[0])
-    if not match:
+        if not match:
-      continue
+            continue
-    start, end = match.groups()
+        start, end = match.groups()
-    content = '\n'.join(lines[1:]) + "\n"
+        content = "\n".join(lines[1:]) + "\n"
-    subtitles.append({
+        subtitles.append(
-      'start': start,
+            {
-      'end': end,
+                "start": start,
-      'content': (content.replace("-\n", "\n").replace("</u>-\n", "</u>\n").replace("-", " ").replace("%", " ").replace("<u> "," <u>").replace(" </u>","</u> ").replace("<u> </u>","").replace("<u></u>","").replace(" \n", "\n"))[:-1]
+                "end": end,
-    })
+                "content": (
-  # def sanitizevttwordlevel(subtitles):
+                    content.replace("-\n", "\n")
-  #   errorwords = []
+                    .replace("</u>-\n", "</u>\n")
-  #   newords = {}
+                    .replace("-", " ")
-  #   for subtitle in subtitles:
+                    .replace("%", " ")
-  #     for word in subtitle["content"].split(" "):
+                    .replace("<u> ", " <u>")
-  #       if ("<u>" in word):
+                    .replace(" </u>", "</u> ")
-  #         newword = None
+                    .replace("<u> </u>", "")
-  #         if (len(word.split("<u>")) > 1):
+                    .replace("<u></u>", "")
-  #           newword = word.replace("<u>", " <u>")
+                    .replace(" \n", "\n")
-  #         if (len(word.split("</u>")) > 1):
+                )[:-1],
-  #           newword = word.replace("</u>", "</u> ")
+            }
-  #         if newword:
+        )
-  #           original = word.split("<u>")[1].split("</u>")[0]
+    # def sanitizevttwordlevel(subtitles):
-  #           if original in errorwords:
+    #   errorwords = []
-  #             for i in errorwords[original]:
+    #   newords = {}
    #   for subtitle in subtitles:
    #     for word in subtitle["content"].split(" "):
    #       if ("<u>" in word):
    #         newword = None
    #         if (len(word.split("<u>")) > 1):
    #           newword = word.replace("<u>", " <u>")
    #         if (len(word.split("</u>")) > 1):
    #           newword = word.replace("</u>", "</u> ")
    #         if newword:
    #           original = word.split("<u>")[1].split("</u>")[0]
    #           if original in errorwords:
    #             for i in errorwords[original]:
-
+    #           else:
-  #           else:
+    #             errorwords[orig].append(word)
  #             errorwords[orig].append(word)
    #   error = False
    #   if "<u>" in subtitle["content"]:
@ -63,347 +77,386 @@ def from_vtt(vtt_string):
    #     for word in subtitle["content"].split(" "):
    #       if word.replace("<u>")
-  #   for subtitle in subtitles:
+    #   for subtitle in subtitles:
-  #     for words in subtitle["content"].split(" "):
+    #     for words in subtitle["content"].split(" "):
-  #       if word in errorwords:
+    #       if word in errorwords:
-  #         subtitle["content"]
+    #         subtitle["content"]
    # sanitizevttwordlevel(subtitles)
    return subtitles
  # sanitizevttwordlevel(subtitles)
  return subtitles
 def to_vtt(subtitles):
    vtt_content = "WEBVTT\n\n\n"
    for idx, subtitle in enumerate(subtitles):
-        content = subtitle['content']
+        content = subtitle["content"]
        if not subtitle.get("split", False):
-          start = subtitle['start']
+            start = subtitle["start"]
-          end = subtitle['end']
+            end = subtitle["end"]
-          vtt_content += f"{start} --> {end}\n{content}\n\n\n"
+            vtt_content += f"{start} --> {end}\n{content}\n\n\n"
        else:
-          vtt_content += f"NOTE {content}\n\n\n"
+            vtt_content += f"NOTE {content}\n\n\n"
    return vtt_content.strip()
 def to_stacked_vtt(subtitles):
-  vtt_content = "WEBVTT\n\n\n"
+    vtt_content = "WEBVTT\n\n\n"
-  buffer = ""
+    buffer = ""
-  for subtitle in subtitles:
+    for subtitle in subtitles:
-    if subtitle.get("split", False):
+        if subtitle.get("split", False):
-      buffer = ""
+            buffer = ""
-      continue
+            continue
-    if len(buffer) != 0:
+        if len(buffer) != 0:
-      if str(subtitle['content'].strip())[-1] == ".":
+            if str(subtitle["content"].strip())[-1] == ".":
-        buffer += "\n"
+                buffer += "\n"
-      else:
+            else:
-        buffer += " "
+                buffer += " "
-    buffer += subtitle['content'].strip()
+        buffer += subtitle["content"].strip()
-    vtt_content += f"{subtitle['start']} --> {subtitle['end']}\n"
+        vtt_content += f"{subtitle['start']} --> {subtitle['end']}\n"
-    vtt_content += buffer
+        vtt_content += buffer
-    vtt_content += "\n\n\n"
+        vtt_content += "\n\n\n"
-  return vtt_content
+    return vtt_content
 def script_from_word_vtt(wordvtt):
-  subtitles = from_vtt(wordvtt)
+    subtitles = from_vtt(wordvtt)
-  print(f"VTT {len(subtitles)} lines. Generating script file from VTT.")
+    print(f"VTT {len(subtitles)} lines. Generating script file from VTT.")
-  sentences = []
+    sentences = []
-  EXCEPTION_FLAG, ADD_NEXT_SENTENCE = "", 0
+    EXCEPTION_FLAG, ADD_NEXT_SENTENCE = "", 0
-  for n, subtitle in enumerate(subtitles):
+    for n, subtitle in enumerate(subtitles):
-    sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
+        sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
-    if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE:
+        if ((sentences[-1] if sentences else None) != sentence) or ADD_NEXT_SENTENCE:
-      sentences.append(sentence)
+            sentences.append(sentence)
-    ADD_NEXT_SENTENCE = 0
+        ADD_NEXT_SENTENCE = 0
-    if subtitle["content"][-4:] == "</u>":
+        if subtitle["content"][-4:] == "</u>":
-      # print(f"{len(sentences)} END {subtitle["content"]}")
+            # print(f"{len(sentences)} END {subtitle["content"]}")
-      ADD_NEXT_SENTENCE = 1
+            ADD_NEXT_SENTENCE = 1
-      if n + 2 < len(subtitles):
+            if n + 2 < len(subtitles):
-        if subtitles[n+2]["content"].replace("<u>", "").replace("</u>", "") != sentence:
+                if (
-          ADD_NEXT_SENTENCE = 0
+                    subtitles[n + 2]["content"].replace("<u>", "").replace("</u>", "")
-  return sentences
+                    != sentence
                ):
                    ADD_NEXT_SENTENCE = 0
    return sentences
 def create_word_scenes(wordvtt, scriptraw):
-  subtitles = from_vtt(wordvtt)
+    subtitles = from_vtt(wordvtt)
-  scripts   = [i for i in scriptraw.split("\n") if i]
+    scripts = [i for i in scriptraw.split("\n") if i]
-  print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")
+    print(f"VTT {len(subtitles)} lines, Script {len(scripts)} lines")
-  scenes = []
+    scenes = []
-  for n, script in enumerate(scripts):
+    for n, script in enumerate(scripts):
-    if len(script.split(" ")) == 1:
+        if len(script.split(" ")) == 1:
-      continue
+            continue
-    scenes.append({"scene": script, "timestamp": []})
+        scenes.append({"scene": script, "timestamp": []})
-  scenes_cur = 0
+    scenes_cur = 0
-  for n, subtitle in enumerate(subtitles):
+    for n, subtitle in enumerate(subtitles):
-    sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
+        sentence = subtitle["content"].replace("<u>", "").replace("</u>", "")
-    if len(sentence.split(" ")) == 1:
+        if len(sentence.split(" ")) == 1:
-      continue
+            continue
-    if sentence != scenes[scenes_cur].get("scene"):
+        if sentence != scenes[scenes_cur].get("scene"):
-      if sentence == scenes[scenes_cur+1].get("scene"):
+            if sentence == scenes[scenes_cur + 1].get("scene"):
-        scenes_cur += 1
+                scenes_cur += 1
-      else:
+            else:
-        print(f"Error, Mismatch in scenes\n=>\"[{scenes_cur}] {scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\"")
+                print(
-        return
+                    f"Error, Mismatch in scenes\n=>\"[{scenes_cur}] {scenes[scenes_cur].get("scene")}\" or \"[{scenes_cur+1}] {scenes[scenes_cur+1].get("scene")}\" != \"{sentence}\""
                )
                return
-    current_scene = scenes[scenes_cur]
+        current_scene = scenes[scenes_cur]
-    if current_scene["timestamp"]:
+        if current_scene["timestamp"]:
-      word_idx = current_scene["timestamp"][-1]["index"] + 1
+            word_idx = current_scene["timestamp"][-1]["index"] + 1
-    else:
+        else:
-      word_idx = 0
+            word_idx = 0
-    # print(scenes_cur, subtitle, word_idx, sentence)
+        # print(scenes_cur, subtitle, word_idx, sentence)
-    if ("<u>" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
+        if ("<u>" not in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
-      pass
+            pass
-    if ("<u>" in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
+        if ("<u>" in subtitle["content"]) and word_idx >= len(sentence.split(" ")):
-      print(f"Error, index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}")
+            print(
-      word_idx = 0
+                f"Error, index wrong. {scenes_cur}, word: {word_idx}, total words: {len(sentence.split(" "))}\n{subtitle}"
-      scenes_cur += 1
+            )
-      current_scene = scenes[scenes_cur]
+            word_idx = 0
-      if current_scene["timestamp"]:
+            scenes_cur += 1
-        word_idx = current_scene["timestamp"][-1]["index"] + 1
+            current_scene = scenes[scenes_cur]
-      else:
+            if current_scene["timestamp"]:
-        word_idx = 0
+                word_idx = current_scene["timestamp"][-1]["index"] + 1
-      print(f"Changed to {word_idx}, {scenes_cur}")
+            else:
                word_idx = 0
            print(f"Changed to {word_idx}, {scenes_cur}")
-    if "<u>" in subtitle["content"]:
+        if "<u>" in subtitle["content"]:
-      # print(subtitle["content"])
+            # print(subtitle["content"])
-      word = subtitle["content"].split("<u>")[1].split("</u>")[0]
+            word = subtitle["content"].split("<u>")[1].split("</u>")[0]
-      if word not in sentence.split(" "):
+            if word not in sentence.split(" "):
-        print(f"Error, Mismatch\n=> \"{word}\" not in \"{sentence}\"")
+                print(f'Error, Mismatch\n=> "{word}" not in "{sentence}"')
-        return
+                return
-      try:
+            try:
-        assert sentence.split(" ")[word_idx] == word
+                assert sentence.split(" ")[word_idx] == word
-      except:
+            except:
-        print(f"Error, Mismatch\n=> \"{word}\" != [{word_idx}] of \"{sentence}\"")
+                print(f'Error, Mismatch\n=> "{word}" != [{word_idx}] of "{sentence}"')
-        return
+                return
-      word_time = {"start": subtitle["start"], "end": subtitle["end"], "index": word_idx, "word": word}
+            word_time = {
-      current_scene["timestamp"].append(word_time)
+                "start": subtitle["start"],
                "end": subtitle["end"],
                "index": word_idx,
                "word": word,
            }
            current_scene["timestamp"].append(word_time)
-  # print(json.dumps(scenes, indent=2))
+    # print(json.dumps(scenes, indent=2))
-  for scene in scenes:
+    for scene in scenes:
-    if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
+        if len(scene["scene"].split(" ")) != len(scene["timestamp"]):
-      print("Error, Mismatch length")
+            print("Error, Mismatch length")
-      return
+            return
-    if "" in scene["scene"].split(" "):
+        if "" in scene["scene"].split(" "):
-      print(repr(scene["scene"]))
+            print(repr(scene["scene"]))
-  full_script, full_scenes = [], []
+    full_script, full_scenes = [], []
-  for scene in scenes:
+    for scene in scenes:
-    full_script += scene["scene"].split(" ")[:-1]
+        full_script += scene["scene"].split(" ")[:-1]
-    full_script.append(scene["scene"].split(" ")[-1]+"##")
+        full_script.append(scene["scene"].split(" ")[-1] + "##")
-    full_scenes += scene["timestamp"]
+        full_scenes += scene["timestamp"]
-  for i, j in zip(full_script, full_scenes):
+    for i, j in zip(full_script, full_scenes):
-    if i.replace("##", "") != j["word"]:
+        if i.replace("##", "") != j["word"]:
-      print("Error, Mismatch")
+            print("Error, Mismatch")
-      return
+            return
    assert len(full_scenes) == len(full_script)
    return full_script, full_scenes
  assert len(full_scenes) == len(full_script)
  return full_script, full_scenes
 # Detect long break or change in context, inserts section break into script.
 def autobreak(lines, times):
-  from datetime import timedelta
+    from datetime import timedelta
-  def parsetime(time_str):
+    def parsetime(time_str):
-    minutes, seconds = time_str.split(':')
+        minutes, seconds = time_str.split(":")
-    seconds, milliseconds = seconds.split('.')
+        seconds, milliseconds = seconds.split(".")
-    td = timedelta(minutes=int(minutes), seconds=int(seconds), milliseconds=int(milliseconds))
+        td = timedelta(
-    return td
+            minutes=int(minutes), seconds=int(seconds), milliseconds=int(milliseconds)
        )
        return td
-  script = []
+    script = []
-  long_breaks = []
+    long_breaks = []
-  tmark = parsetime("0:0.0")
+    tmark = parsetime("0:0.0")
-  for i, j in zip(lines, times):
+    for i, j in zip(lines, times):
-    tdiff = parsetime(j["start"]) - tmark
+        tdiff = parsetime(j["start"]) - tmark
-    tmark = parsetime(j["end"])
+        tmark = parsetime(j["end"])
-    if tdiff > parsetime("0:0.0"):
+        if tdiff > parsetime("0:0.0"):
-      long_breaks.append(tdiff)
+            long_breaks.append(tdiff)
-      # print()
+            # print()
-    # print(i, end=" ")
+        # print(i, end=" ")
-  # print()
+    # print()
-  mean_break = parsetime("0:0.0")
+    mean_break = parsetime("0:0.0")
-  for i in long_breaks:
+    for i in long_breaks:
-    mean_break += i/len(long_breaks)
+        mean_break += i / len(long_breaks)
-  print(mean_break)
+    print(mean_break)
-  script = ""
+    script = ""
-  tmark = parsetime("0:0.0")
+    tmark = parsetime("0:0.0")
-  tmp = " "
+    tmp = " "
-  continous_line = 0
+    continous_line = 0
-  for i, j in zip(lines, times):
+    for i, j in zip(lines, times):
-    tdiff = parsetime(j["start"]) - tmark
+        tdiff = parsetime(j["start"]) - tmark
-    tmark = parsetime(j["end"])
+        tmark = parsetime(j["end"])
-    if tdiff > mean_break and tmp[-1] != ".":
+        if tdiff > mean_break and tmp[-1] != ".":
-      script += "\n"
+            script += "\n"
-    if (tdiff >= mean_break and tmp[-1] == "."):
+        if tdiff >= mean_break and tmp[-1] == ".":
-        script += "\n"
+            script += "\n"
-        continous_line = 0
+            continous_line = 0
-    else:
+        else:
-      continous_line += 1
+            continous_line += 1
-    script += i.replace("##", "")
+        script += i.replace("##", "")
        if i[-1] == ".":
            script += "\n"
        elif "##" in i:
            script += "\n"
        else:
            script += " "
        tmp = i
    return script
    if i[-1] == ".":
      script += "\n"
    elif "##" in i:
        script += "\n"
    else:
      script += " " 
    tmp = i
  return script
 def scene_from_new_script(raw_script, full_script, full_scenes):
-  mod_script = raw_script.replace("\n", " \n ").split(" ")
+    mod_script = raw_script.replace("\n", " \n ").split(" ")
-  mod_script = [i for i in mod_script if i]
+    mod_script = [i for i in mod_script if i]
-  n = 0
+    n = 0
-  while True:
+    while True:
-    if mod_script[n] == "\n":
+        if mod_script[n] == "\n":
-      mod_script[n-1] += "\n"
+            mod_script[n - 1] += "\n"
-      del(mod_script[n])
+            del mod_script[n]
-      n -= 1
+            n -= 1
-    n += 1
+        n += 1
-    if n == len(mod_script):
+        if n == len(mod_script):
-      break
+            break
-  # print(mod_script)
+    # print(mod_script)
-  print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
+    print(f"Original: {len(full_script)}, Modded: {len(mod_script)}")
-  allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
+    allowed_list = [".", "\n", "\n\n", ",", "?", "##"]
    def normalized(x):
        for i in allowed_list:
            x = x.replace(i, "")
        return x.upper()
    same = lambda a, b: normalized(a) == normalized(b)
    new_script, new_timestamp, orig_index, n = [], [], 0, 0
    fail = 0
    while n < len(mod_script):
        print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
        word = mod_script[n]
        if same(word, full_script[orig_index].replace("##", "")):
            cur = full_scenes[orig_index]
            new_script.append(word.replace("##", ""))
            new_timestamp.append({"start": cur["start"], "end": cur["end"]})
            fail = 0
        else:
            if fail > 10:
                print("Error: Failed to match words,")
                return
            # print("Back")
            fail += 1
            n -= 1
        n, orig_index = n + 1, orig_index + 1
    assert len(new_script) == len(new_timestamp)
    return new_script, new_timestamp
  def normalized(x):
    for i in allowed_list:
      x = x.replace(i, "")
    return x.upper()
  same = lambda a, b: normalized(a) == normalized(b)
  new_script, new_timestamp, orig_index, n = [], [], 0, 0
  fail = 0
  while n < len(mod_script):
    print(f"{repr(mod_script[n]):>20} ? {repr(full_script[orig_index])}")
    word = mod_script[n]
    if same(word, full_script[orig_index].replace("##", "")):
      cur = full_scenes[orig_index]
      new_script.append(word.replace("##", ""))
      new_timestamp.append({"start": cur["start"], "end": cur["end"]})
      fail = 0
    else:
      if fail > 10:
        print("Error: Failed to match words,")
        return
      # print("Back")
      fail += 1
      n -= 1
    n, orig_index = n+1, orig_index+1
  assert len(new_script) == len(new_timestamp)
  return new_script, new_timestamp
 def build_new_subtitle(new_script, new_timestamp):
-  buffer, new_scenes, start, end = [], [], None, None
+    buffer, new_scenes, start, end = [], [], None, None
-  current_scene = []
+    current_scene = []
-  # print(" ".join(new_script).split("\n"))
+    # print(" ".join(new_script).split("\n"))
-  for i, j in zip(new_script, new_timestamp):
+    for i, j in zip(new_script, new_timestamp):
-    if "\n" in i:
+        if "\n" in i:
-      buffer.append(i.replace("\n", ""))
+            buffer.append(i.replace("\n", ""))
-      current_scene.append({"content": " ".join(buffer).replace("##", ""), "start": start, "end": j["end"]})
+            current_scene.append(
-      buffer, start = [], None
+                {
-      if "\n\n" in i:
+                    "content": " ".join(buffer).replace("##", ""),
-        print(f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\"")
+                    "start": start,
                    "end": j["end"],
                }
            )
            buffer, start = [], None
            if "\n\n" in i:
                print(
                    f"Section break at line #{len(current_scene):<3}| \"{current_scene[-1]["content"]}\""
                )
                new_scenes.append(current_scene)
                current_scene = []
        else:
            buffer.append(i)
            if not start:
                start = j["start"]
    if start:
        buffer.append(i.replace("\n", ""))
        current_scene.append(
            {"content": " ".join(buffer), "start": start, "end": j["end"]}
        )
    if current_scene != (new_scenes[-1] if new_scenes else None):
        new_scenes.append(current_scene)
        current_scene = []
    else:
      buffer.append(i)
      if not start:
        start = j["start"]
-  if start:
+    # print("\n\n".join(["\n".join([j["content"] for j in i]) for i in new_scenes]))
-      buffer.append(i.replace("\n", ""))
+    newsub = []
-      current_scene.append({"content": " ".join(buffer), "start": start, "end": j["end"]})
+    for n, i in enumerate(new_scenes):
        newsub += i
        if n < len(new_scenes) - 1:
            newsub.append(
                {"content": "Break", "start": None, "end": None, "split": True}
            )
-  if current_scene != (new_scenes[-1] if new_scenes else None):
+    return newsub
    new_scenes.append(current_scene)
  # print("\n\n".join(["\n".join([j["content"] for j in i]) for i in new_scenes]))
  newsub = []
  for n, i in enumerate(new_scenes):
    newsub += i
    if n < len(new_scenes) - 1:
      newsub.append({"content": "Break", "start": None, "end": None, "split": True})
-  return newsub
+def saveFile(filename, data, override=False):
    if os.path.exists(filename) and not override:
        print(f"File {filename} already exists.")
        return -1
    with open(filename, "w") as f:
        f.write(data)
 def saveFile(filename, data, override = False):
  if os.path.exists(filename) and not override:
    print(f"File {filename} already exists.")
    return -1
  with open(filename, "w") as f:
    f.write(data)
 def openFile(filename):
-  with open(filename, "r") as f:
+    with open(filename, "r") as f:
-    data = f.read()
+        data = f.read()
-  if not data:
+    if not data:
-    return -1
+        return -1
-  return data
+    return data
 def main(vttfile, scriptfile):
-  modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
+    modfile = ".".join(scriptfile.split(".")[:-1]) + ".script"
-  x = create_word_scenes(openFile(vttfile), openFile(scriptfile))
+    x = create_word_scenes(openFile(vttfile), openFile(scriptfile))
  if not x:
    sys.exit(-1)
  full_script, full_scenes = x
  if not os.path.exists(modfile):
    genscript = autobreak(full_script,full_scenes)
    saveFile(modfile, genscript)
    print(f"Saved modification file as {modfile}. Modify it and return back.")
  else:
    x = scene_from_new_script(openFile(modfile), full_script, full_scenes)
    if not x:
-      sys.exit(-1)
+        sys.exit(-1)
-    a, b = x
+    full_script, full_scenes = x
-    final_vtt = build_new_subtitle(a, b)
+    if not os.path.exists(modfile):
-    jsonfile = ".".join(vttfile.split(".")[:-1]) + ".json"
+        genscript = autobreak(full_script, full_scenes)
-    saveFile(jsonfile, json.dumps(final_vtt, indent=2), True)
+        saveFile(modfile, genscript)
-    print(f"Saved JSON file as {jsonfile}. Fix it, and convert it to VTT.")
+        print(f"Saved modification file as {modfile}. Modify it and return back.")
    else:
        x = scene_from_new_script(openFile(modfile), full_script, full_scenes)
        if not x:
            sys.exit(-1)
        a, b = x
-if __name__=="__main__":
+        final_vtt = build_new_subtitle(a, b)
-  import sys
+        jsonfile = ".".join(vttfile.split(".")[:-1]) + ".json"
-  if len(sys.argv) not in (2, 3):
+        saveFile(jsonfile, json.dumps(final_vtt, indent=2), True)
-    print(f"Usage: {sys.argv[0].split("/")[-1]} [vtt file] (txt file)\n"                                  \
+        print(f"Saved JSON file as {jsonfile}. Fix it, and convert it to VTT.")
     f"       {sys.argv[0].split("/")[-1]} [JSON file]\n"                                                 \
      "** Only output from openai-whisper with '--word-timestamp true' is accepted.)\n"                   \
      "** You have to run this for first time, and then fix .script file, and then re-run this script.\n" \
      "** Adding newline/period/commas are onlt permitted. Fix else in JSON file.")
    sys.exit()
-  vtt = sys.argv[1]
+
-  print(f"\n[{vtt}]")
+if __name__ == "__main__":
-  if len(sys.argv) == 3:
+    import sys
-    script = sys.argv[2]
+
-    if (not os.path.exists(vtt)) or (not os.path.exists(script)):
+    if len(sys.argv) not in (2, 3):
-      print(f"Input file doesnt exists.")
+        print(
-      sys.exit(-1)
+            f"Usage: {sys.argv[0].split("/")[-1]} [vtt file] (txt file)\n"
-    main(vtt, script)
+            f"       {sys.argv[0].split("/")[-1]} [JSON file]\n"
-  else:
+            "** Only output from openai-whisper with '--word-timestamp true' is accepted.)\n"
-    if ".json" in vtt:
+            "** You have to run this for first time, and then fix .script file, and then re-run this script.\n"
-      final_vtt = json.loads(openFile(vtt))
+            "** Adding newline/period/commas are onlt permitted. Fix else in JSON file."
-      orgf = ".".join(vtt.split(".")[:-1])
+        )
-      print(f"Saved VTT file as {orgf}.final.vtt.")
+        sys.exit()
-      saveFile(orgf + ".final.vtt", to_vtt(final_vtt), True)
+
-      saveFile(orgf + ".stacked.vtt", to_stacked_vtt(final_vtt), True)
+    vtt = sys.argv[1]
-      sys.exit(0)
+    print(f"\n[{vtt}]")
-    if (not os.path.exists(vtt)):
+    if len(sys.argv) == 3:
-      print(f"Input file doesnt exists.")
+        script = sys.argv[2]
-      sys.exit(-1)
+        if (not os.path.exists(vtt)) or (not os.path.exists(script)):
-    script = ".".join(vtt.split(".")[:-1]) + ".txt"
+            print(f"Input file doesnt exists.")
-    saveFile(script, "\n".join(script_from_word_vtt(openFile(vtt))))
+            sys.exit(-1)
-    main(vtt, script)
+        main(vtt, script)
    else:
        if ".json" in vtt:
            final_vtt = json.loads(openFile(vtt))
            orgf = ".".join(vtt.split(".")[:-1])
            print(f"Saved VTT file as {orgf}.final.vtt.")
            saveFile(orgf + ".final.vtt", to_vtt(final_vtt), True)
            saveFile(orgf + ".stacked.vtt", to_stacked_vtt(final_vtt), True)
            sys.exit(0)
        if not os.path.exists(vtt):
            print(f"Input file doesnt exists.")
            sys.exit(-1)
        script = ".".join(vtt.split(".")[:-1]) + ".txt"
        saveFile(script, "\n".join(script_from_word_vtt(openFile(vtt))))
        main(vtt, script)