speechbrain · mravanelli · Jul 4, 2022 · Jun 30, 2022 · Jul 1, 2022
diff --git a/.gitignore b/.gitignore
@@ -51,6 +51,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+tests/tmp/
 
 # Translations
 *.mo
@@ -117,6 +118,9 @@ ENV/
 env.bak/
 venv.bak/
 
+# PyCharm project settings
+.idea
+
 # Spyder project settings
 .spyderproject
 .spyproject

diff --git a/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml b/recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml
@@ -16,6 +16,7 @@ __set_seed: !apply:torch.manual_seed [!ref <seed>]
 data_folder: !PLACEHOLDER # e.g. /localscratch/common_voice_kpd/
 output_folder: !ref results/ECAPA-TDNN/<seed>
 save_folder: !ref <output_folder>/save
+rir_folder: !ref <data_folder>
 train_log: !ref <output_folder>/train_log.txt
 device: 'cuda:0'
 skip_prep: False
@@ -51,7 +52,7 @@ test_dataloader_options:
 # Added noise and reverb come from OpenRIR dataset, automatically
 # downloaded and prepared with this Environmental Corruption class.
 env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt
-    openrir_folder: !ref <data_folder>
+    openrir_folder: !ref <rir_folder>
     openrir_max_noise_len: 3.0  # seconds
     babble_prob: 0.0
     reverb_prob: 1.0

diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_en_with_wav2vec.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_en_with_wav2vec.yaml
@@ -106,7 +106,7 @@ wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
     source: !ref <wav2vec2_hub>
     output_norm: True
     freeze: !ref <freeze_wav2vec>
-    save_path: !ref <save_folder>/wav2vec2_checkpoints
+    save_path: !ref <save_folder>/wav2vec2_checkpoint
 
 #####
 # Uncomment this block if you prefer to use a Fairseq pretrained model instead

diff --git a/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw_with_wav2vec.yaml b/recipes/CommonVoice/ASR/seq2seq/hparams/train_rw_with_wav2vec.yaml
@@ -106,7 +106,7 @@ wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
     source: !ref <wav2vec2_hub>
     output_norm: True
     freeze: !ref <freeze_wav2vec>
-    save_path: !ref <save_folder>/wav2vec2_checkpoints
+    save_path: !ref <save_folder>/wav2vec2_checkpoint
 
 #####
 # Uncomment this block if you prefer to use a Fairseq pretrained model instead

diff --git a/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml b/recipes/CommonVoice/self-supervised-learning/wav2vec2/hparams/wav2vec2_base.yaml
@@ -93,7 +93,7 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
 
 wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2Pretrain
     source: !ref <wav2vec2_hub>
-    save_path: !ref <save_folder>/wav2vec2_checkpoints
+    save_path: !ref <save_folder>/wav2vec2_checkpoint
     mask_prob: !ref <mask_prob>
     mask_length: !ref <mask_length>
 

diff --git a/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml b/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml
@@ -10,7 +10,7 @@ output_folder: !ref results/epaca/<seed>
 save_folder: !ref <output_folder>/save
 train_log: !ref <output_folder>/train_log.txt
 data_folder: ./
-
+rir_folder: !ref <data_folder>
 
 shards_url: /data/voxlingua107_shards
 train_meta: !ref <shards_url>/train/meta.json
@@ -79,7 +79,7 @@ augment_speed: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
 
 
 add_rev_noise: !new:speechbrain.lobes.augment.EnvCorrupt
-    openrir_folder: !ref <data_folder>
+    openrir_folder: !ref <rir_folder>
     openrir_max_noise_len: 3.0  # seconds
     reverb_prob: 0.5
     noise_prob: 0.8

diff --git a/speechbrain/dataio/dataio.py b/speechbrain/dataio/dataio.py
@@ -189,7 +189,7 @@ def read_audio(waveforms_obj):
     -------
     >>> dummywav = torch.rand(16000)
     >>> import os
-    >>> tmpfile = os.path.join(str(getfixture('tmpdir')),  "wave.wav")
+    >>> tmpfile = str(getfixture('tmpdir') / "wave.wav")
     >>> write_audio(tmpfile, dummywav, 16000)
     >>> asr_example = { "wav": tmpfile, "spk_id": "foo", "words": "foo bar"}
     >>> loaded = read_audio(asr_example["wav"])
@@ -257,7 +257,7 @@ def read_audio_multichannel(waveforms_obj):
     -------
     >>> dummywav = torch.rand(16000, 2)
     >>> import os
-    >>> tmpfile = os.path.join(str(getfixture('tmpdir')),  "wave.wav")
+    >>> tmpfile = str(getfixture('tmpdir') / "wave.wav")
     >>> write_audio(tmpfile, dummywav, 16000)
     >>> asr_example = { "wav": tmpfile, "spk_id": "foo", "words": "foo bar"}
     >>> loaded = read_audio(asr_example["wav"])
@@ -305,7 +305,7 @@ def write_audio(filepath, audio, samplerate):
     Example
     -------
     >>> import os
-    >>> tmpfile = os.path.join(str(getfixture('tmpdir')),  "wave.wav")
+    >>> tmpfile = str(getfixture('tmpdir') / "wave.wav")
     >>> dummywav = torch.rand(16000, 2)
     >>> write_audio(tmpfile, dummywav, 16000)
     >>> loaded = read_audio(tmpfile)
@@ -605,7 +605,7 @@ def write_txt_file(data, filename, sampling_rate=None):
     -------
     >>> tmpdir = getfixture('tmpdir')
     >>> signal=torch.tensor([1,2,3,4])
-    >>> write_txt_file(signal, os.path.join(tmpdir, 'example.txt'))
+    >>> write_txt_file(signal, tmpdir / 'example.txt')
     """
     del sampling_rate  # Not used.
     # Check if the path of filename exists
@@ -642,7 +642,7 @@ def write_stdout(data, filename=None, sampling_rate=None):
     -------
     >>> tmpdir = getfixture('tmpdir')
     >>> signal = torch.tensor([[1,2,3,4]])
-    >>> write_stdout(signal, tmpdir + '/example.txt')
+    >>> write_stdout(signal, tmpdir / 'example.txt')
     [1, 2, 3, 4]
     """
     # Managing Torch.Tensor
@@ -805,7 +805,7 @@ def save_md5(files, out_file):
     Example:
     >>> files = ['tests/samples/single-mic/example1.wav']
     >>> tmpdir = getfixture('tmpdir')
-    >>> save_md5(files, os.path.join(tmpdir, "md5.pkl"))
+    >>> save_md5(files, tmpdir / "md5.pkl")
     """
     # Initialization of the dictionary
     md5_dict = {}
@@ -830,7 +830,7 @@ def save_pkl(obj, file):
 
     Example
     -------
-    >>> tmpfile = os.path.join(getfixture('tmpdir'), "example.pkl")
+    >>> tmpfile = getfixture('tmpdir') / "example.pkl"
     >>> save_pkl([1, 2, 3, 4, 5], tmpfile)
     >>> load_pkl(tmpfile)
     [1, 2, 3, 4, 5]
@@ -983,7 +983,9 @@ def merge_csvs(data_folder, csv_lst, merged_csv):
 
     Example
     -------
-    >>> merge_csvs("tests/samples/annotation/",
+    >>> tmpdir = getfixture('tmpdir')
+    >>> os.symlink(os.path.realpath("tests/samples/annotation/speech.csv"), tmpdir / "speech.csv")
+    >>> merge_csvs(tmpdir,
     ... ["speech.csv", "speech.csv"],
     ... "test_csv_merge.csv")
     """

diff --git a/speechbrain/pretrained/interfaces.py b/speechbrain/pretrained/interfaces.py
@@ -766,10 +766,10 @@ class EncoderClassifier(Pretrained):
 
     >>> # Compute embeddings
     >>> signal, fs = torchaudio.load("tests/samples/single-mic/example1.wav")
-    >>> embeddings =  classifier.encode_batch(signal)
+    >>> embeddings = classifier.encode_batch(signal)
 
     >>> # Classification
-    >>> prediction =  classifier .classify_batch(signal)
+    >>> prediction = classifier.classify_batch(signal)
     """
 
     MODULES_NEEDED = [
@@ -2344,7 +2344,8 @@ class GraphemeToPhoneme(Pretrained, EncodeDecodePipelineMixin):
     >>> text = ("English is tough. It can be understood "
     ...         "through thorough thought though")
     >>> from speechbrain.pretrained import GraphemeToPhoneme
-    >>> g2p = GraphemeToPhoneme.from_hparams('path/to/model') # doctest: +SKIP
+    >>> tmpdir = getfixture('tmpdir')
+    >>> g2p = GraphemeToPhoneme.from_hparams('path/to/model', savedir=tmpdir) # doctest: +SKIP
     >>> phonemes = g2p.g2p(text) # doctest: +SKIP
     """
 
@@ -2590,7 +2591,8 @@ class Tacotron2(Pretrained):
 
     Example
     -------
-    >>> tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir")
+    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
+    >>> tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir=tmpdir_tts)
     >>> mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
     >>> items = [
     ...   "A quick brown fox jumped over the lazy dog",
@@ -2601,7 +2603,8 @@ class Tacotron2(Pretrained):
 
     >>> # One can combine the TTS model with a vocoder (that generates the final waveform)
     >>> # Intialize the Vocoder (HiFIGAN)
-    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
+    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
+    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder)
     >>> # Running the TTS
     >>> mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
     >>> # Running Vocoder (spectrogram-to-waveform)
@@ -2679,13 +2682,15 @@ class HIFIGAN(Pretrained):
 
     Example
     -------
-    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")
+    >>> tmpdir_vocoder = getfixture('tmpdir') / "vocoder"
+    >>> hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir=tmpdir_vocoder)
     >>> mel_specs = torch.rand(2, 80,298)
     >>> waveforms = hifi_gan.decode_batch(mel_specs)
 
     >>> # You can use the vocoder coupled with a TTS system
     >>>	# Intialize TTS (tacotron2)
-    >>>	tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
+    >>> tmpdir_tts = getfixture('tmpdir') / "tts"
+    >>>	tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir=tmpdir_tts)
     >>>	# Running the TTS
     >>>	mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb")
     >>>	# Running Vocoder (spectrogram-to-waveform)
@@ -2737,7 +2742,7 @@ def decode_spectrogram(self, spectrogram):
         audio can be saved by:
         >>> waveform = torch.rand(1, 666666)
         >>> sample_rate = 22050
-        >>> torchaudio.save("test.wav", waveform, sample_rate)
+        >>> torchaudio.save(str(getfixture('tmpdir') / "test.wav"), waveform, sample_rate)
         """
         if self.first_call:
             self.hparams.generator.remove_weight_norm()

diff --git a/speechbrain/tokenizers/SentencePiece.py b/speechbrain/tokenizers/SentencePiece.py
@@ -84,23 +84,21 @@ class SentencePiece:
     -------
     >>> import torch
     >>> dict_int2lab = {1: "HELLO", 2: "MORNING"}
-    >>> model_dir = "tests/unittests/tokenizer_data/"
+    >>> model_dir = getfixture('tmpdir') / "tokenizer_data"
     >>> # Example with csv
-    >>> annotation_train = "tests/unittests/tokenizer_data/dev-clean.csv"
+    >>> annotation_train = "tests/samples/annotation/dev-clean.csv"
     >>> annotation_read = "wrd"
     >>> model_type = "bpe"
-    >>> bpe = SentencePiece(model_dir,100, annotation_train, annotation_read,
-    ...                     model_type)
+    >>> bpe = SentencePiece(str(model_dir), 100, annotation_train, annotation_read, model_type)
     >>> batch_seq = torch.Tensor([[1, 2, 2, 1],[1, 2, 1, 0]])
     >>> batch_lens = torch.Tensor([1.0, 0.75])
     >>> encoded_seq_ids, encoded_seq_pieces = bpe(
     ...     batch_seq, batch_lens, dict_int2lab, task="encode"
     ... )
     >>> # Example using JSON
-    >>> annotation_train = "tests/unittests/tokenizer_data/dev-clean.json"
+    >>> annotation_train = str(model_dir + "/dev-clean.json")
     >>> annotation_read = "wrd"
-    >>> bpe = SentencePiece(model_dir,100, annotation_train, annotation_read,
-    ...                     model_type, annotation_format = 'json')
+    >>> bpe = SentencePiece(model_dir, 100, annotation_train, annotation_read, model_type, annotation_format = 'json')
     >>> encoded_seq_ids, encoded_seq_pieces = bpe(
     ...     batch_seq, batch_lens, dict_int2lab, task="encode"
     ... )
@@ -142,7 +140,12 @@ def __init__(
         if self.annotation_train is not None:
             ext = os.path.splitext(self.annotation_train)[1]
             if text_file is None:
-                text_file = self.annotation_train.replace(ext, ".txt")
+                text_file = os.path.join(
+                    model_dir,
+                    os.path.basename(self.annotation_train).replace(
+                        ext, ".txt"
+                    ),
+                )
             self.text_file = text_file
 
         self.prefix_model_file = os.path.join(

diff --git a/speechbrain/utils/check_HF_repo.py b/speechbrain/utils/check_HF_repo.py
@@ -11,7 +11,7 @@
 def run_HF_check(
     recipe_csvfile="tests/recipes.csv",
     field="HF_repo",
-    output_folder="HF_repos",
+    output_folder="tests/tmp/HF",
 ):
     """Checks if the code reported in the readme files of the HF repository is
     runnable. Note: the tests run the code marked as python in the readme file.

diff --git a/speechbrain/utils/recipe_tests.py b/speechbrain/utils/recipe_tests.py
@@ -2,6 +2,7 @@
 
 Authors
  * Mirco Ravanelli 2022
+ * Andreas Nautsch 2022
 """
 import os
 import re
@@ -318,7 +319,7 @@ def run_recipe_tests(
     test_field="test_debug_flags",
     check_field="test_debug_checks",
     run_opts="--device=cpu",
-    output_folder="tests/recipe_tests/",
+    output_folder="tests/tmp/recipes/",
     filters_fields=[],
     filters=[],
     do_checks=True,
@@ -429,9 +430,19 @@ def load_yaml_test(
     avoid_list=[
         "templates/hyperparameter_optimization_speaker_id/train.yaml",
         "templates/speaker_id/train.yaml",
+        # Recipes that fail with errors when no NVIDIA driver is installed on the system
+        "recipes/timers-and-such/multistage/hparams/train_LS_LM.yaml",
+        "recipes/timers-and-such/multistage/hparams/train_TAS_LM.yaml",
+        "recipes/timers-and-such/direct/hparams/train.yaml",
+        "recipes/timers-and-such/decoupled/hparams/train_LS_LM.yaml",
+        "recipes/timers-and-such/decoupled/hparams/train_TAS_LM.yaml",
+        "recipes/fluent-speech-commands/direct/hparams/train.yaml",
+        "recipes/CommonLanguage/lang_id/hparams/train_ecapa_tdnn.yaml",
+        "recipes/SLURP/direct/hparams/train.yaml",
     ],
-    data_folder="yaml_check_folder",
-    output_folder="yaml_check_folder",
+    rir_folder="tests/tmp/rir",
+    data_folder="tests/tmp/yaml",
+    output_folder="tests/tmp/yaml",
 ):
     """Tests if the yaml files can be loaded without errors.
 
@@ -453,6 +464,8 @@ def load_yaml_test(
         See above.
     avoid_list: list
         List of hparam file not to check.
+    rir_folder:
+        This overrides the rir_folder, rir_path, openrir_folder, open_rir_folder, and data_folder_rirs entries usually specified in the hparam files.
     data_folder:
         This overrides the data_folder usually specified in the hparam files.
     output_folder:
@@ -470,19 +483,25 @@ def load_yaml_test(
     # Set data_foler and output folder
     data_folder = os.path.join(cwd, data_folder)
     output_folder = os.path.join(cwd, output_folder)
+    rir_folder = os.path.join(cwd, rir_folder)
 
     # Additional overrides
     add_overrides = {
         "manual_annot_folder": data_folder,
         "musan_folder": data_folder,
         "tea_models_dir": data_folder,
-        "rir_path": data_folder,
         "wsj_root": data_folder,
         "tokenizer_file": data_folder,
         "commonlanguage_folder": data_folder,
         "tea_infer_dir": data_folder,
         "original_data_folder": data_folder,
         "pretrain_st_dir": data_folder,
+        # RIR folder specifications -> all point to the same zip file: one download destination
+        "rir_path": rir_folder,
+        "rir_folder": rir_folder,
+        "openrir_folder": rir_folder,
+        "open_rir_folder": rir_folder,
+        "data_folder_rirs": rir_folder,
     }
 
     # Read the csv recipe file and detect which tests we have to run
@@ -523,10 +542,10 @@ def load_yaml_test(
         # Append additional overrides when needed
         with open(hparam_file) as f:
             for line in f:
-                for key in add_overrides.keys():
+                for key, value in add_overrides.items():
                     pattern = key + ":"
                     if pattern in line and line.find(pattern) == 0:
-                        overrides.update({key: data_folder})
+                        overrides.update({key: value})
 
         with open(hparam_file) as fin:
             try:

diff --git a/templates/enhancement/train.yaml b/templates/enhancement/train.yaml
@@ -25,6 +25,7 @@ data_folder: ./data
 output_folder: !ref ./results/<seed>
 save_folder: !ref <output_folder>/save
 train_log: !ref <output_folder>/train_log.txt
+rir_folder: !ref <data_folder>
 
 # Path where data manifest files will be stored
 # The data manifest files are created by the data preparation script.
@@ -73,7 +74,7 @@ resynth: !name:speechbrain.processing.signal_processing.resynthesize
 # downloaded and prepared with this Environmental Corruption class.
 # The babble is generated from other utterances in each batch.
 env_corruption: !new:speechbrain.lobes.augment.EnvCorrupt
-    openrir_folder: !ref <data_folder>
+    openrir_folder: !ref <rir_folder>
     openrir_max_noise_len: 10
     noise_snr_low: 0
     noise_snr_high: 15

diff --git a/tests/.run-HF-checks.sh b/tests/.run-HF-checks.sh
@@ -1,3 +1,2 @@
 #!/bin/bash
-scp -r tests HF_repos
 python -c 'from speechbrain.utils.check_HF_repo import run_HF_check; print("TEST FAILED!") if not(run_HF_check()) else print("TEST PASSED!")'
diff --git a/tests/.run-load-yaml-tests.sh b/tests/.run-load-yaml-tests.sh
@@ -2,4 +2,6 @@
 pip install pesq
 pip install pystoi
 pip install librosa
+pip install tensorboard
+pip install transformers
 python -c 'from speechbrain.utils.recipe_tests import load_yaml_test; print("TEST FAILED!") if not(load_yaml_test()) else print("TEST PASSED")'
diff --git a/tests/unittests/tokenizer_data/dev-clean.csv → tests/samples/annotation/dev-clean.csv b/tests/unittests/tokenizer_data/dev-clean.csv → tests/samples/annotation/dev-clean.csv