Skip to content

Commit a0a9e99

Browse files
committed
include optional word time codes in forced alignment and transcription
1 parent 6ce24da commit a0a9e99

File tree

13 files changed

+293
-62
lines changed

13 files changed

+293
-62
lines changed

README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,16 @@ $ subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/sub
104104
$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
105105
$ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
106106
$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
107+
$ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" --word_time_codes -o raw_subtitle.json
107108
$ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
108-
$ subaligner -m transcribe -v video.mp4 -s subtitle.srt -upp -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
109+
$ subaligner -m transcribe -v video.mp4 -s subtitle.srt --use_prior_prompting -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
110+
109111
```
110112
```
111113
# Alignment on segmented plain texts (double newlines as the delimiter)
112114
113115
$ subaligner -m script -v video.mp4 -s subtitle.txt -o subtitle_aligned.srt
116+
$ subaligner -m script -v video.mp4 -s subtitle.txt --word_time_codes -o raw_subtitle.json
114117
$ subaligner -m script -v https://example.com/video.mp4 -s https://example.com/subtitle.txt -o subtitle_aligned.srt
115118
```
116119
```
@@ -175,7 +178,9 @@ $ docker run -v `pwd`:`pwd` -w `pwd` -it baxtree/subaligner subaligner -m dual -
175178
$ docker run -it baxtree/subaligner subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
176179
$ docker run -it baxtree/subaligner subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
177180
```
178-
The aligned subtitle will be saved at `subtitle_aligned.srt`. For details on CLIs, run `subaligner -h` or `subaligner_batch -h`, `subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h` for additional utilities. `subaligner_1pass` and `subaligner_2pass` are shortcuts for running `subaligner` with `-m single` and `-m dual` options, respectively.
181+
The aligned subtitle will be saved at `subtitle_aligned.srt`. To obtain the subtitle in raw JSON format for downstream
182+
processing, replace the output file extension with `.json`. For details on CLIs, run `subaligner -h` or `subaligner_batch -h`,
183+
`subaligner_convert -h`, `subaligner_train -h` and `subaligner_tune -h` for additional utilities. `subaligner_1pass` and `subaligner_2pass` are shortcuts for running `subaligner` with `-m single` and `-m dual` options, respectively.
179184

180185
![](figures/screencast.gif)
181186

site/source/usage.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,14 @@ Make sure you have got the virtual environment activated upfront.
2727
(.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf small -o subtitle_aligned.srt
2828
(.venv) $ subaligner -m transcribe -v video.mp4 -ml zho -mr whisper -mf medium -o subtitle_aligned.srt
2929
(.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" -o subtitle_aligned.srt
30+
(.venv) $ subaligner -m transcribe -v video.mp4 -ml eng -mr whisper -mf turbo -ip "your initial prompt" --word_time_codes -o raw_subtitle.json
3031
(.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
31-
(.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt -upp -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
32+
(.venv) $ subaligner -m transcribe -v video.mp4 -s subtitle.srt --use_prior_prompting -ml eng -mr whisper -mf turbo -o subtitle_aligned.srt
3233

3334
**Alignment on segmented plain texts (double newlines as the delimiter)**::
3435

3536
(.venv) $ subaligner -m script -v video.mp4 -s subtitle.txt -o subtitle_aligned.srt
37+
(.venv) $ subaligner -m script -v video.mp4 -s subtitle.txt --word_time_codes -o raw_subtitle.json
3638
(.venv) $ subaligner -m script -v https://example.com/video.mp4 -s https://example.com/subtitle.txt -o subtitle_aligned.srt
3739

3840
**Alignment on multiple subtitles against the single media file**::
@@ -80,6 +82,11 @@ Make sure you have got the virtual environment activated upfront.
8082
$ docker run -it baxtree/subaligner subaligner -m single -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
8183
$ docker run -it baxtree/subaligner subaligner -m dual -v https://example.com/video.mp4 -s https://example.com/subtitle.srt -o subtitle_aligned.srt
8284

85+
The aligned subtitle will be saved at ``subtitle_aligned.srt``. To obtain the subtitle in raw JSON format for downstream
86+
processing, replace the output file extension with ``.json``. For details on CLIs, run ``subaligner -h`` or ``subaligner_batch -h``,
87+
``subaligner_convert -h``, ``subaligner_train -h`` and ``subaligner_tune -h`` for additional utilities. ``subaligner_1pass`` and
88+
``subaligner_2pass`` are shortcuts for running ``subaligner`` with ``-m single`` and ``-m dual`` options, respectively.
89+
8390
**Run alignments with pipx**::
8491

8592
$ pipx run subaligner -m single -v video.mp4 -s subtitle.srt

subaligner/__main__.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@
55
[-fos] [-tod TRAINING_OUTPUT_DIRECTORY] [-o OUTPUT] [-t TRANSLATE] [-os OFFSET_SECONDS]
66
[-ml {afr,amh,ara,arg,asm,aze,ben,bos,bul,cat,ces,cmn,cym,dan,deu,ell,eng,epo,est,eus,fas,fin,fra,gla,gle,glg,grc,grn,guj,heb,hin,hrv,hun,hye,ina,ind,isl,ita,jbo,jpn,kal,kan,kat,kir,kor,kur,lat,lav,lfn,lit,mal,mar,mkd,mlt,msa,mya,nah,nep,nld,nor,ori,orm,pan,pap,pol,por,ron,rus,sin,slk,slv,spa,sqi,srp,swa,swe,tam,tat,tel,tha,tsn,tur,ukr,urd,vie,yue,zho}]
77
[-mr {whisper}] [-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}] [-ip INITIAL_PROMPT] [-mcl MAX_CHAR_LENGTH]
8-
[-tr {helsinki-nlp,whisper,facebook-mbart,facebook-m2m100}] [-tf TRANSLATION_FLAVOUR] [-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-wt] [-upp] [-lgs] [-d] [-q] [-ver]
8+
[-tr {helsinki-nlp,whisper,facebook-mbart,facebook-m2m100}] [-tf TRANSLATION_FLAVOUR] [-mpt MEDIA_PROCESS_TIMEOUT] [-sat SEGMENT_ALIGNMENT_TIMEOUT] [-upp] [-wtc] [-lgs] [-d] [-q] [-ver]
99
1010
Subaligner command line interface
1111
1212
optional arguments:
1313
-h, --help show this help message and exit
1414
-s SUBTITLE_PATH [SUBTITLE_PATH ...], --subtitle_path SUBTITLE_PATH [SUBTITLE_PATH ...]
15-
File path or URL to the subtitle file (Extensions of supported subtitles: .vtt, .srt, .ass, .sbv, .sub, .txt, .ttml, .ssa, .dfxp, .ytt, .stl, .tmp, .smi, .scc, .sami, .xml) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
15+
File path or URL to the subtitle file (Extensions of supported subtitles: .ssa, .tmp, .srt, .sbv, .stl, .json, .sami, .ttml, .smi, .txt, .scc, .sub, .ass, .vtt, .xml, .ytt, .dfxp) or selector for the embedded subtitle (e.g., embedded:page_num=888 or embedded:stream_index=0)
1616
-l MAX_LOGLOSS, --max_logloss MAX_LOGLOSS
1717
Max global log loss for alignment
1818
-so, --stretch_on Switch on stretch on subtitles
@@ -48,6 +48,8 @@
4848
Maximum waiting time in seconds when aligning each segment
4949
-upp, --use_prior_prompting
5050
Whether to use the previous subtitle cue as the current prompt.
51+
-wtc, --word_time_codes
52+
Whether to output time codes for each word in the subtitle file.
5153
-lgs, --languages Print out language codes used for stretch and translation
5254
-d, --debug Print out debugging information
5355
-q, --quiet Switch off logging information
@@ -231,6 +233,8 @@ def main():
231233
)
232234
parser.add_argument("-upp", "--use_prior_prompting", action="store_true",
233235
help="Whether to use the previous subtitle cue as the current prompt.")
236+
parser.add_argument("-wtc", "--word_time_codes", action="store_true",
237+
help="Whether to output time codes for each word in the subtitle file.")
234238
parser.add_argument("-lgs", "--languages", action="store_true",
235239
help="Print out language codes used for stretch and translation")
236240
parser.add_argument("-d", "--debug", action="store_true",
@@ -364,6 +368,7 @@ def main():
364368
video_file_path=local_video_path,
365369
subtitle_file_path=local_subtitle_path,
366370
stretch_in_lang=stretch_in_lang,
371+
with_word_time_codes=FLAGS.word_time_codes,
367372
)
368373
elif FLAGS.mode == "transcribe":
369374
from subaligner.transcriber import Transcriber
@@ -372,13 +377,15 @@ def main():
372377
subtitle, frame_rate = transcriber.transcribe(video_file_path=local_video_path,
373378
language_code=stretch_in_lang,
374379
initial_prompt=FLAGS.initial_prompt,
375-
max_char_length=FLAGS.max_char_length)
380+
max_char_length=FLAGS.max_char_length,
381+
with_word_time_codes=FLAGS.word_time_codes)
376382
else:
377383
subtitle, frame_rate = transcriber.transcribe_with_subtitle_as_prompts(video_file_path=local_video_path,
378384
subtitle_file_path=local_subtitle_path,
379385
language_code=stretch_in_lang,
380386
max_char_length=FLAGS.max_char_length,
381-
use_prior_prompting=FLAGS.use_prior_prompting)
387+
use_prior_prompting=FLAGS.use_prior_prompting,
388+
with_word_time_codes=FLAGS.word_time_codes,)
382389
aligned_subs = subtitle.subs
383390
else:
384391
print("ERROR: Unknown mode {}".format(FLAGS.mode))

subaligner/predictor.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,18 @@ def predict_dual_pass(
140140
if os.path.exists(audio_file_path):
141141
os.remove(audio_file_path)
142142

143-
def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stretch_in_lang: str = "eng") -> tuple:
143+
def predict_plain_text(self,
144+
video_file_path: str,
145+
subtitle_file_path: str,
146+
stretch_in_lang: str = "eng",
147+
with_word_time_codes: bool = False) -> tuple:
144148
"""Predict time to shift with plain texts
145149
146150
Arguments:
147151
video_file_path {string} -- The input video file path.
148152
subtitle_file_path {string} -- The path to the subtitle file.
149153
stretch_in_lang {str} -- The language used for stretching subtitles (default: {"eng"}).
154+
with_word_time_codes {bool} -- True to output time codes for each word (default: {False}).
150155
151156
Returns:
152157
tuple: The shifted subtitles, the audio file path (None) and the voice probabilities of the original audio (None).
@@ -178,9 +183,22 @@ def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stre
178183
runtime_config_string = "dtw_algorithm=stripe" # stripe or exact
179184
task = Task(config_string=task_config_string)
180185

186+
path = None
187+
if with_word_time_codes:
188+
_, path = tempfile.mkstemp()
189+
processed = []
190+
with open(subtitle_file_path, "r", encoding="utf-8") as f:
191+
for line in f.readlines():
192+
# TODO: Use tokenizers to process languages that do not use spaces as word delimiters
193+
processed.append((os.linesep * 2).join(line.strip().split()) if line.strip() else os.linesep)
194+
195+
with open(path, "w", encoding="utf-8") as f:
196+
f.write((os.linesep * 2).join(processed))
197+
f.flush()
198+
181199
try:
182200
task.audio_file_path_absolute = audio_file_path
183-
task.text_file_path_absolute = subtitle_file_path
201+
task.text_file_path_absolute = subtitle_file_path if not with_word_time_codes else path
184202
task.sync_map_file_path_absolute = "{}.srt".format(root)
185203

186204
tee = False if self.__LOGGER.level == getattr(logging, 'DEBUG') else True
@@ -205,7 +223,8 @@ def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stre
205223
try:
206224
frame_rate = self.__media_helper.get_frame_rate(video_file_path)
207225
self.__feature_embedder.step_sample = 1 / frame_rate
208-
self.__on_frame_timecodes(adjusted_subs)
226+
if not with_word_time_codes:
227+
self.__on_frame_timecodes(adjusted_subs)
209228
except NoFrameRateException:
210229
self.__LOGGER.warning("Cannot detect the frame rate for %s" % video_file_path)
211230

@@ -220,6 +239,8 @@ def predict_plain_text(self, video_file_path: str, subtitle_file_path: str, stre
220239
os.remove(task.audio_file_path_absolute)
221240
if task.sync_map_file_path_absolute is not None and os.path.exists(task.sync_map_file_path_absolute):
222241
os.remove(task.sync_map_file_path_absolute)
242+
if path is not None and os.path.exists(path):
243+
os.remove(path)
223244

224245
def get_log_loss(self, voice_probabilities: np.ndarray, subs: List[SubRipItem]) -> float:
225246
"""Returns a single loss value on voice prediction

subaligner/subtitle.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class Subtitle(object):
4545
SCC_EXTENSIONS = [".scc"]
4646
SBV_EXTENSIONS = [".sbv"]
4747
YT_TRANSCRIPT_EXTENSIONS = [".ytt"]
48+
JSON_RAW_EXTENSIONS = [".json"]
4849

4950
def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str) -> None:
5051
assert (
@@ -81,6 +82,8 @@ def __init__(self, secret: object, subtitle_file_path: str, subtitle_format: str
8182
self.__subs = self.__convert_sbv_to_subs(subtitle_file_path)
8283
elif subtitle_format == "ytt":
8384
self.__subs = self.__convert_ytt_to_subs(subtitle_file_path)
85+
elif subtitle_format == "json":
86+
self.__subs = self.__convert_json_raw_to_subs(subtitle_file_path)
8487
else:
8588
raise UnsupportedFormatException(
8689
"Unknown subtitle format for file: {}".format(subtitle_file_path)
@@ -272,6 +275,19 @@ def load_ytt(cls, subtitle_file_path: str) -> "Subtitle":
272275

273276
return cls(cls.__secret, subtitle_file_path, "ytt")
274277

278+
@classmethod
279+
def load_json(cls, subtitle_file_path: str) -> "Subtitle":
280+
"""Load a JSON raw subtitle file.
281+
282+
Arguments:
283+
subtitle_file_path {string} -- The path to the subtitle file.
284+
285+
Returns:
286+
Subtitle: Subtitle object.
287+
"""
288+
289+
return cls(cls.__secret, subtitle_file_path, "json")
290+
275291
@classmethod
276292
def load(cls, subtitle_file_path: str) -> "Subtitle":
277293
"""Load a SubRip or TTML subtitle file based on the file extension.
@@ -310,6 +326,8 @@ def load(cls, subtitle_file_path: str) -> "Subtitle":
310326
return cls(cls.__secret, subtitle_file_path, "sbv")
311327
elif file_extension in cls.YT_TRANSCRIPT_EXTENSIONS:
312328
return cls(cls.__secret, subtitle_file_path, "ytt")
329+
elif file_extension in cls.JSON_RAW_EXTENSIONS:
330+
return cls(cls.__secret, subtitle_file_path, "json")
313331
else:
314332
return cls(cls.__secret, subtitle_file_path, "unknown")
315333

@@ -380,6 +398,8 @@ def shift_subtitle(
380398
subs = cls(cls.__secret, subtitle_file_path, "sbv").subs
381399
elif file_extension.lower() in cls.YT_TRANSCRIPT_EXTENSIONS:
382400
subs = cls(cls.__secret, subtitle_file_path, "ytt").subs
401+
elif file_extension.lower() in cls.JSON_RAW_EXTENSIONS:
402+
subs = cls(cls.__secret, subtitle_file_path, "json").subs
383403
else:
384404
raise UnsupportedFormatException(
385405
"Unknown subtitle format for file: {}".format(subtitle_file_path)
@@ -493,7 +513,7 @@ def subtitle_extensions() -> set:
493513
+ Subtitle.SSA_EXTENTIONS + Subtitle.ADVANCED_SSA_EXTENTIONS + Subtitle.MICRODVD_EXTENSIONS
494514
+ Subtitle.MPL2_EXTENSIONS + Subtitle.TMP_EXTENSIONS + Subtitle.SAMI_EXTENSIONS
495515
+ Subtitle.STL_EXTENSIONS + Subtitle.SCC_EXTENSIONS + Subtitle.SBV_EXTENSIONS
496-
+ Subtitle.YT_TRANSCRIPT_EXTENSIONS)
516+
+ Subtitle.YT_TRANSCRIPT_EXTENSIONS + Subtitle.JSON_RAW_EXTENSIONS)
497517

498518
@property
499519
def subtitle_file_path(self) -> str:
@@ -712,6 +732,21 @@ def __convert_ytt_to_subs(ytt_file_path: str) -> SubRipFile:
712732

713733
return Subtitle._get_srt_subs(path, housekeep=True)
714734

735+
@staticmethod
736+
def __convert_json_raw_to_subs(json_file_path: str) -> SubRipFile:
737+
"""Convert a subtitle file from the JSON raw format to the SubRip format
738+
739+
Arguments:
740+
json_file_path {string} -- The path to the JSON subtitle file.
741+
742+
Returns:
743+
SubRipFile: A list of SubRipItems.
744+
"""
745+
_, path = tempfile.mkstemp()
746+
Utils.json2srt(json_file_path, path)
747+
748+
return Subtitle._get_srt_subs(path, housekeep=True)
749+
715750
@staticmethod
716751
def __export_with_format(subs: List[SubRipItem], source_file_path: str, target_file_path: Optional[str], file_extension: str, suffix: str) -> None:
717752
if target_file_path is None:
@@ -833,6 +868,13 @@ def __save_subtitle_by_extension(file_extension: str,
833868
Utils.srt2ytt(path, target_file_path)
834869
finally:
835870
os.remove(path)
871+
elif file_extension in Subtitle.JSON_RAW_EXTENSIONS:
872+
try:
873+
_, path = tempfile.mkstemp()
874+
SubRipFile(subs).save(path, encoding=encoding)
875+
Utils.srt2json(path, target_file_path)
876+
finally:
877+
os.remove(path)
836878
else:
837879
raise UnsupportedFormatException(
838880
"Unknown subtitle format for file: {}".format(source_file_path)

0 commit comments

Comments
 (0)