Skip to content

File Helper

FileHelper

Helps loading audio segments.

Source code in pbp/file_helper.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
class FileHelper:
    """
    Helps loading audio segments.
    """

    def __init__(
        self,
        log,  # : loguru.Logger,
        json_base_dir: str,
        audio_base_dir: Optional[str] = None,
        audio_path_map_prefix: str = "",
        audio_path_prefix: str = "",
        segment_size_in_mins: int = 1,
        s3_client: Optional[BaseClient] = None,
        gs_client: Optional[GsClient] = None,
        download_dir: Optional[str] = None,
        assume_downloaded_files: bool = False,
        retain_downloaded_files: bool = False,
        print_downloading_lines: bool = False,
    ):
        """
        Handles file loading and path mapping for audio processing.

        Args:
            log: Logger instance.
            json_base_dir (str): Base directory (or URI prefix) containing the per-day JSON files.
            audio_base_dir (str, optional): Base directory for relative `path` attributes in JSON entries.
            audio_path_map_prefix (str, optional): Prefix mapping for resolving actual audio URIs.
                Example: `"s3://pacific-sound-256khz-2022~file:///PAM_Archive/2022"`.
            audio_path_prefix (str, optional): Ad hoc path prefix for sound file locations, e.g., `"/Volumes"`.
            segment_size_in_mins (int, optional): The size of each extracted audio segment in minutes. Defaults to `1`.
            s3_client (object, optional): S3 client for handling `s3://` URIs.
            gs_client (object, optional): Google Cloud Storage client for handling `gs://` URIs.
            download_dir (str, optional): Directory to save downloaded S3 files. Defaults to the current directory.
            assume_downloaded_files (bool, optional): If `True`, skips downloading files that already exist in `download_dir`.
            retain_downloaded_files (bool, optional): If `True`, does not remove downloaded files after use.
            print_downloading_lines (bool, optional): If `True`, prints `"downloading <uri>"` messages to the console.
        """
        self.log = log

        self.log.info(
            "Creating FileHelper:"
            + f"\n    json_base_dir:           {json_base_dir}"
            + (
                f"\n    audio_base_dir:          {audio_base_dir}"
                if audio_base_dir
                else ""
            )
            + (
                f"\n    audio_path_map_prefix:   '{audio_path_map_prefix}'"
                if audio_path_map_prefix
                else ""
            )
            + (
                f"\n    audio_path_prefix:       '{audio_path_prefix}'"
                if audio_path_prefix
                else ""
            )
            + f"\n    segment_size_in_mins:    {segment_size_in_mins}"
            + f"\n    s3_client:               {'(given)' if s3_client else 'None'}"
            + f"\n    gs_client:               {'(given)' if gs_client else 'None'}"
            + f"\n    download_dir:            {download_dir}"
            + f"\n    assume_downloaded_files: {assume_downloaded_files}"
            + f"\n    retain_downloaded_files: {retain_downloaded_files}"
            + f"\n    print_downloading_lines: {print_downloading_lines}"
            + "\n"
        )
        self.json_base_dir = json_base_dir
        self.audio_base_dir = audio_base_dir
        self.audio_path_map_prefix = audio_path_map_prefix
        self.audio_path_prefix = audio_path_prefix
        self.segment_size_in_mins = segment_size_in_mins
        self.s3_client = s3_client
        self.gs_client = gs_client
        self.download_dir: str = download_dir if download_dir else "."
        self.assume_downloaded_files = assume_downloaded_files
        self.retain_downloaded_files = retain_downloaded_files
        self.print_downloading_lines = print_downloading_lines

        # cache of SoundStatus per audio URI, pruned by age in _get_sound_status:
        self.sound_cache: Dict[str, SoundStatus] = {}

        # the following set by select_day:
        self.year: Optional[int] = None
        self.month: Optional[int] = None
        self.day: Optional[int] = None
        self.json_entries: Optional[List[JEntry]] = None

    def select_day(self, year: int, month: int, day: int) -> bool:
        """
        Selects the given day for subsequent processing of relevant audio segments.

        Args:
            year (int): The year.
            month (int): The month.
            day (int): The day.

        Returns:
            True only if selection was successful
        """

        self.log.info(f"Selecting day: {year:04}{month:02}{day:02}")

        json_uri = f"{self.json_base_dir}/{year:04}/{year:04}{month:02}{day:02}.json"
        json_contents = self._get_json(json_uri)
        if json_contents is None:
            self.log.error(f"{json_uri}: file not found\n")
            return False

        self.year = year
        self.month = month
        self.day = day
        self.json_entries = list(parse_json_contents(json_contents))
        return True

    def get_local_filename(self, uri: str) -> Optional[str]:
        """
        Returns the local filename for the given URI, which will be that of
        the downloaded file when the given uri is cloud based.

        Args:
            uri (str): The URI of the file.

        Returns:
            The local filename or None if error or if the scheme is not `s3` or `gs`.
        """
        parsed_uri = urlparse(uri)
        if parsed_uri.scheme in ("s3", "gs"):
            return _download(
                log=self.log,
                parsed_uri=parsed_uri,
                download_dir=self.download_dir,
                assume_downloaded_files=self.assume_downloaded_files,
                print_downloading_lines=self.print_downloading_lines,
                s3_client=self.s3_client,
                gs_client=self.gs_client,
            )

        return parsed_uri.path

    def day_completed(self):
        """
        ProcessHelper calls this to indicate that the day's processing is completed.
        Since a process is launched only for a day, we simply clear the cache.
        """
        # first, close all sound files still open:
        num_still_open = 0
        for c_uri, c_ss in list(self.sound_cache.items()):
            # due to some weird issues (when running under dask), let's be extra careful:
            # TODO clean up this!
            if (
                hasattr(c_ss, "sound_file")
                and hasattr(c_ss, "sound_file_open")
                and c_ss.sound_file_open
            ):
                c_ss.sound_file_open = False
                num_still_open += 1
                self.log.debug(f"Closing sound file for cached {c_uri=} {c_ss.age=}")
                c_ss.sound_file.close()
        self.log.debug(
            f"day_completed: closed {num_still_open} sound files that were still open."
        )

        # remove any downloaded files (cloud case):
        if not self.retain_downloaded_files:
            for c_ss in self.sound_cache.values():
                c_ss.remove_downloaded_file()

        self.sound_cache = {}

    def _get_json(self, uri: str) -> Optional[str]:
        """
        Returns the contents of the JSON file at the given URI, or None if it
        could not be obtained.
        """
        parsed_uri = urlparse(uri)
        if parsed_uri.scheme == "s3":
            return self._get_json_s3(parsed_uri)
        # NOTE(review): `gs://` JSON URIs are not dispatched to a download here,
        # although _download supports gs — confirm whether that is intended.
        #  simply assume local file:
        if os.name == "nt":
            # on Windows, use the raw uri: urlparse would interpret a drive
            # letter like "C:" as the URI scheme and mangle the path.
            return self._get_json_local(uri)
        else:
            return self._get_json_local(parsed_uri.path)

    def _get_json_s3(self, parsed_uri: ParseResult) -> Optional[str]:
        """
        Downloads the JSON file at the given parsed URI and returns its
        contents, or None if the download failed.
        """
        local_filename = _download(
            log=self.log,
            parsed_uri=parsed_uri,
            download_dir=self.download_dir,
            assume_downloaded_files=self.assume_downloaded_files,
            print_downloading_lines=self.print_downloading_lines,
            s3_client=self.s3_client,
            gs_client=self.gs_client,
        )
        if local_filename is None:
            return None
        return self._get_json_local(local_filename)

    def extract_audio_segment(
        self, at_hour: int, at_minute: int
    ) -> Optional[Tuple[AudioInfo, np.ndarray]]:
        """
        Extracts the audio segment at the given start time.
        For this it loads and aggregates the relevant audio segments.

        Args:
            at_hour (int): Hour of the start time of the segment to extract.
            at_minute (int): Minute of the start time of the segment to extract.

        Returns:
            A tuple (audio_info, audio_segment) or None
        """

        # select_day must have been called successfully before this:
        assert self.json_entries is not None
        assert self.year is not None
        assert self.month is not None
        assert self.day is not None

        intersections = get_intersecting_entries(
            self.log,
            self.json_entries,
            self.year,
            self.month,
            self.day,
            at_hour,
            at_minute,
            segment_size_in_mins=self.segment_size_in_mins,
        )

        # captured from the first contributing file; all subsequent files must agree:
        audio_info: Optional[AudioInfo] = None

        # accumulates the samples of all intersecting pieces, in order:
        aggregated_segment: Optional[np.ndarray] = None

        prefix = f"({at_hour:02}h:{at_minute:02}m)"
        for intersection in intersections:
            if intersection.duration_secs == 0:
                self.log.warning("No data from intersection")
                continue

            ss = self._get_sound_status(intersection.entry.uri)
            if ss.error is not None:
                return None

            self.log.debug(
                f"    {prefix} {intersection.duration_secs} secs from {ss.sound_filename}"
            )

            if audio_info is not None and not self._check_audio_info(
                audio_info, ss.audio_info
            ):
                return None  # error!

            audio_info = ss.audio_info

            start_sample = floor(intersection.start_secs * audio_info.samplerate)
            num_samples = ceil(intersection.duration_secs * audio_info.samplerate)

            try:
                new_pos = ss.sound_file.seek(start_sample)
                if new_pos != start_sample:
                    # no-data case, let's just read 0 samples to get an empty array:
                    audio_segment = ss.sound_file.read(0)
                else:
                    audio_segment = ss.sound_file.read(num_samples)
                    if len(audio_segment) < num_samples:
                        # partial-data case.
                        self.log.warning(
                            f"!!! partial data: {len(audio_segment)} < {num_samples}"
                        )

            except sf.LibsndfileError as e:
                self.log.error(f"{e}")
                return None

            if aggregated_segment is None:
                aggregated_segment = audio_segment
            else:
                aggregated_segment = np.concatenate((aggregated_segment, audio_segment))

        if aggregated_segment is not None:
            assert audio_info is not None
            return audio_info, aggregated_segment
        return None

    def _check_audio_info(self, ai1: AudioInfo, ai2: AudioInfo) -> bool:
        """
        Returns True only if the two AudioInfo objects agree on samplerate,
        channels, and subtype; mismatches are logged as errors.
        """
        if ai1.samplerate != ai2.samplerate:
            self.log.error(
                f"UNEXPECTED: sample rate mismatch: {ai1.samplerate} vs {ai2.samplerate}"
            )
            return False
        if ai1.channels != ai2.channels:
            self.log.error(
                f"UNEXPECTED: channel count mismatch: {ai1.channels} vs {ai2.channels}"
            )
            return False
        if ai1.subtype != ai2.subtype:
            self.log.error(
                f"UNEXPECTED: subtype mismatch: {ai1.subtype} vs {ai2.subtype}"
            )
            return False
        return True

    def _get_sound_status(self, uri: str) -> SoundStatus:
        """
        Returns a SoundStatus object for the given uri.
        Internally, the 'age' attribute helps to keep the relevant files open
        as long as recently used. Note that traversal of the files indicated in the
        JSON array happens in a monotonically increasing order in time, so we
        can increment the 'age' for all entries in the cache except for the uri
        just requested.

        Args:
            uri: The URI of the sound file.

        Returns:
            The SoundStatus object.
        """
        self.log.debug(f"_get_sound_status: {uri=}")
        ss = self.sound_cache.get(uri)
        if ss is None:
            # currently cached ones get a bit older:
            for c_ss in self.sound_cache.values():
                c_ss.age += 1

            self.log.debug(f"SoundStatus: creating for {uri=}")
            ss = SoundStatus(
                log=self.log,
                uri=uri,
                audio_base_dir=self.audio_base_dir,
                audio_path_map_prefix=self.audio_path_map_prefix,
                audio_path_prefix=self.audio_path_prefix,
                download_dir=self.download_dir,
                assume_downloaded_files=self.assume_downloaded_files,
                print_downloading_lines=self.print_downloading_lines,
                s3_client=self.s3_client,
                gs_client=self.gs_client,
            )
            self.sound_cache[uri] = ss
        else:
            self.log.debug(f"SoundStatus: already available for {uri=}")

        # close and remove files in the cache that are not fresh enough in terms
        # of not being recently used
        for c_uri, c_ss in list(self.sound_cache.items()):
            if uri != c_uri and c_ss.age > 2 and c_ss.sound_file_open:
                self.log.debug(
                    f"Closing sound file for cached uri={c_uri} age={c_ss.age}"
                )
                c_ss.sound_file.close()
                c_ss.sound_file_open = False
                if not self.retain_downloaded_files:
                    c_ss.remove_downloaded_file()

        def log_msg():
            # built lazily: only evaluated when debug logging is enabled
            c_sss = self.sound_cache.values()
            open_files = len([c_ss for c_ss in c_sss if c_ss.sound_file_open])
            ages = [c_ss.age for c_ss in c_sss]
            return f"{open_files=}  ages={brief_list(ages)}"

        self.log.opt(lazy=True).debug("_get_sound_status: {}", log_msg)

        return ss

    def _get_json_local(self, filename: str) -> Optional[str]:
        """
        Reads the given local JSON file, returning its contents as a string,
        or None upon any I/O error.
        """
        try:
            with open(filename, "r", encoding="UTF-8") as f:
                return f.read()
        except IOError as e:
            # report which file failed (was a literal "(unknown)" placeholder):
            self.log.error(f"Error reading {filename}: {e}")
            return None

__init__(log, json_base_dir, audio_base_dir=None, audio_path_map_prefix='', audio_path_prefix='', segment_size_in_mins=1, s3_client=None, gs_client=None, download_dir=None, assume_downloaded_files=False, retain_downloaded_files=False, print_downloading_lines=False)

Handles file loading and path mapping for audio processing.

Parameters:

Name Type Description Default
log

Logger instance.

required
audio_base_dir str

Base directory for relative path attributes in JSON entries.

None
audio_path_map_prefix str

Prefix mapping for resolving actual audio URIs. Example: "s3://pacific-sound-256khz-2022~file:///PAM_Archive/2022".

''
audio_path_prefix str

Ad hoc path prefix for sound file locations, e.g., "/Volumes".

''
segment_size_in_mins int

The size of each extracted audio segment in minutes. Defaults to 1.

1
s3_client object

S3 client for handling s3:// URIs.

None
gs_client object

Google Cloud Storage client for handling gs:// URIs.

None
download_dir str

Directory to save downloaded S3 files. Defaults to the current directory.

None
assume_downloaded_files bool

If True, skips downloading files that already exist in download_dir.

False
retain_downloaded_files bool

If True, does not remove downloaded files after use.

False
print_downloading_lines bool

If True, prints "downloading <uri>" messages to the console.

False
Source code in pbp/file_helper.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
def __init__(
    self,
    log,  # : loguru.Logger,
    json_base_dir: str,
    audio_base_dir: Optional[str] = None,
    audio_path_map_prefix: str = "",
    audio_path_prefix: str = "",
    segment_size_in_mins: int = 1,
    s3_client: Optional[BaseClient] = None,
    gs_client: Optional[GsClient] = None,
    download_dir: Optional[str] = None,
    assume_downloaded_files: bool = False,
    retain_downloaded_files: bool = False,
    print_downloading_lines: bool = False,
):
    """
    Sets up file loading and path mapping for audio processing.

    Args:
        log: Logger instance.
        json_base_dir (str): Base directory containing the per-day JSON files.
        audio_base_dir (str, optional): Base directory for relative `path` attributes in JSON entries.
        audio_path_map_prefix (str, optional): Prefix mapping used to resolve actual audio URIs,
            e.g. `"s3://pacific-sound-256khz-2022~file:///PAM_Archive/2022"`.
        audio_path_prefix (str, optional): Ad hoc path prefix for sound file locations, e.g. `"/Volumes"`.
        segment_size_in_mins (int, optional): Size in minutes of each extracted audio segment. Defaults to `1`.
        s3_client (object, optional): S3 client used for `s3://` URIs.
        gs_client (object, optional): Google Cloud Storage client used for `gs://` URIs.
        download_dir (str, optional): Where downloaded files are saved. Defaults to the current directory.
        assume_downloaded_files (bool, optional): If `True`, files already present in `download_dir`
            are not downloaded again.
        retain_downloaded_files (bool, optional): If `True`, downloaded files are kept after use.
        print_downloading_lines (bool, optional): If `True`, `"downloading <uri>"` lines are printed
            to the console.
    """
    self.log = log

    # assemble the creation report piecewise, omitting unset optional settings:
    report = [
        "Creating FileHelper:",
        f"\n    json_base_dir:           {json_base_dir}",
    ]
    if audio_base_dir:
        report.append(f"\n    audio_base_dir:          {audio_base_dir}")
    if audio_path_map_prefix:
        report.append(f"\n    audio_path_map_prefix:   '{audio_path_map_prefix}'")
    if audio_path_prefix:
        report.append(f"\n    audio_path_prefix:       '{audio_path_prefix}'")
    report.append(f"\n    segment_size_in_mins:    {segment_size_in_mins}")
    report.append(f"\n    s3_client:               {'(given)' if s3_client else 'None'}")
    report.append(f"\n    gs_client:               {'(given)' if gs_client else 'None'}")
    report.append(f"\n    download_dir:            {download_dir}")
    report.append(f"\n    assume_downloaded_files: {assume_downloaded_files}")
    report.append(f"\n    retain_downloaded_files: {retain_downloaded_files}")
    report.append(f"\n    print_downloading_lines: {print_downloading_lines}")
    report.append("\n")
    self.log.info("".join(report))

    self.json_base_dir = json_base_dir
    self.audio_base_dir = audio_base_dir
    self.audio_path_map_prefix = audio_path_map_prefix
    self.audio_path_prefix = audio_path_prefix
    self.segment_size_in_mins = segment_size_in_mins
    self.s3_client = s3_client
    self.gs_client = gs_client
    self.download_dir: str = download_dir if download_dir else "."
    self.assume_downloaded_files = assume_downloaded_files
    self.retain_downloaded_files = retain_downloaded_files
    self.print_downloading_lines = print_downloading_lines

    # cache of SoundStatus per audio URI:
    self.sound_cache: Dict[str, SoundStatus] = {}

    # the following set by select_day:
    self.year: Optional[int] = None
    self.month: Optional[int] = None
    self.day: Optional[int] = None
    self.json_entries: Optional[List[JEntry]] = None

select_day(year, month, day)

Selects the given day for subsequent processing of relevant audio segments.

Parameters:

Name Type Description Default
year int

The year.

required
month int

The month.

required
day int

The day.

required

Returns:

Type Description
bool

True only if selection was successful

Source code in pbp/file_helper.py
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def select_day(self, year: int, month: int, day: int) -> bool:
    """
    Selects the given day for subsequent processing of relevant audio segments.

    Args:
        year (int): The year.
        month (int): The month.
        day (int): The day.

    Returns:
        True only if selection was successful
    """

    date_str = f"{year:04}{month:02}{day:02}"
    self.log.info(f"Selecting day: {date_str}")

    # the day's JSON file lives under a per-year subdirectory:
    json_uri = f"{self.json_base_dir}/{year:04}/{date_str}.json"
    json_contents = self._get_json(json_uri)
    if json_contents is None:
        self.log.error(f"{json_uri}: file not found\n")
        return False

    self.year, self.month, self.day = year, month, day
    self.json_entries = list(parse_json_contents(json_contents))
    return True

get_local_filename(uri)

Returns the local filename for the given URI, which will be that of the downloaded file when the given uri is cloud based.

Parameters:

Name Type Description Default
uri str

The URI of the file.

required

Returns:

Type Description
Optional[str]

The local filename or None if error or if the scheme is not s3 or gs.

Source code in pbp/file_helper.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
def get_local_filename(self, uri: str) -> Optional[str]:
    """
    Resolves the given URI to a local filename; for cloud (`s3`/`gs`) URIs
    this is the name of the downloaded file.

    Args:
        uri (str): The URI of the file.

    Returns:
        The local filename, or None upon a download error.
    """
    parsed_uri = urlparse(uri)
    if parsed_uri.scheme not in ("s3", "gs"):
        # not cloud based: the path component is already the local filename
        return parsed_uri.path

    return _download(
        log=self.log,
        parsed_uri=parsed_uri,
        download_dir=self.download_dir,
        assume_downloaded_files=self.assume_downloaded_files,
        print_downloading_lines=self.print_downloading_lines,
        s3_client=self.s3_client,
        gs_client=self.gs_client,
    )

day_completed()

ProcessHelper calls this to indicate that the day's processing is completed. Since a process is launched only for a day, we simply clear the cache.

Source code in pbp/file_helper.py
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
def day_completed(self):
    """
    Called by ProcessHelper once the day's processing is done.
    A process is launched for a single day only, so we just tear down the cache.
    """
    # close any sound files that remain open:
    num_still_open = 0
    for c_uri, c_ss in list(self.sound_cache.items()):
        # due to some weird issues (when running under dask), let's be extra careful:
        # TODO clean up this!
        still_open = (
            hasattr(c_ss, "sound_file")
            and hasattr(c_ss, "sound_file_open")
            and c_ss.sound_file_open
        )
        if not still_open:
            continue
        c_ss.sound_file_open = False
        num_still_open += 1
        self.log.debug(f"Closing sound file for cached {c_uri=} {c_ss.age=}")
        c_ss.sound_file.close()
    self.log.debug(
        f"day_completed: closed {num_still_open} sound files that were still open."
    )

    # cloud case: drop the downloaded copies unless asked to retain them
    if not self.retain_downloaded_files:
        for c_ss in self.sound_cache.values():
            c_ss.remove_downloaded_file()

    self.sound_cache = {}

extract_audio_segment(at_hour, at_minute)

Extracts the audio segment at the given start time. For this it loads and aggregates the relevant audio segments.

Parameters:

Name Type Description Default
at_hour int

The hour when the audio segment was extracted.

required
at_minute int

The minute when the audio segment was extracted.

required

Returns:

Type Description
Optional[Tuple[AudioInfo, ndarray]]

A tuple (audio_info, audio_segment) or None

Source code in pbp/file_helper.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
def extract_audio_segment(
    self, at_hour: int, at_minute: int
) -> Optional[Tuple[AudioInfo, np.ndarray]]:
    """
    Extracts the audio segment at the given start time.
    For this it loads and aggregates the relevant audio segments.

    Args:
        at_hour (int): Hour of the start time of the segment to extract.
        at_minute (int): Minute of the start time of the segment to extract.

    Returns:
        A tuple (audio_info, audio_segment) or None
    """

    # select_day must have been called successfully before this:
    assert self.json_entries is not None
    assert self.year is not None
    assert self.month is not None
    assert self.day is not None

    # the JSON entries (i.e., sound files) overlapping the requested window:
    intersections = get_intersecting_entries(
        self.log,
        self.json_entries,
        self.year,
        self.month,
        self.day,
        at_hour,
        at_minute,
        segment_size_in_mins=self.segment_size_in_mins,
    )

    # captured from the first contributing file; all subsequent files must
    # agree with it (checked via _check_audio_info):
    audio_info: Optional[AudioInfo] = None

    # accumulates the samples of all intersecting pieces, in order:
    aggregated_segment: Optional[np.ndarray] = None

    prefix = f"({at_hour:02}h:{at_minute:02}m)"
    for intersection in intersections:
        if intersection.duration_secs == 0:
            self.log.warning("No data from intersection")
            continue

        ss = self._get_sound_status(intersection.entry.uri)
        if ss.error is not None:
            # the sound file could not be opened/resolved; abort the extraction
            return None

        self.log.debug(
            f"    {prefix} {intersection.duration_secs} secs from {ss.sound_filename}"
        )

        if audio_info is not None and not self._check_audio_info(
            audio_info, ss.audio_info
        ):
            return None  # error!

        audio_info = ss.audio_info

        # convert the intersection's time window to sample offsets:
        start_sample = floor(intersection.start_secs * audio_info.samplerate)
        num_samples = ceil(intersection.duration_secs * audio_info.samplerate)

        try:
            # seek returns the resulting position; a mismatch indicates the
            # requested offset is beyond the file's data:
            new_pos = ss.sound_file.seek(start_sample)
            if new_pos != start_sample:
                # no-data case, let's just read 0 samples to get an empty array:
                audio_segment = ss.sound_file.read(0)
            else:
                audio_segment = ss.sound_file.read(num_samples)
                if len(audio_segment) < num_samples:
                    # partial-data case.
                    self.log.warning(
                        f"!!! partial data: {len(audio_segment)} < {num_samples}"
                    )

        except sf.LibsndfileError as e:
            self.log.error(f"{e}")
            return None

        if aggregated_segment is None:
            aggregated_segment = audio_segment
        else:
            aggregated_segment = np.concatenate((aggregated_segment, audio_segment))

    if aggregated_segment is not None:
        assert audio_info is not None
        return audio_info, aggregated_segment
    # no intersecting data at all for the requested window:
    return None