Skip to content

File Helper

FileHelper

Helps loading audio segments.

Source code in pbp/file_helper.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
class FileHelper:
    """
    Helps loading audio segments.
    """

    def __init__(
        self,
        log,  # : loguru.Logger,
        json_base_dir: str,
        audio_base_dir: Optional[str] = None,
        audio_path_map_prefix: str = "",
        audio_path_prefix: str = "",
        segment_size_in_mins: int = 1,
        s3_client: Optional[BaseClient] = None,
        gs_client: Optional[GsClient] = None,
        download_dir: Optional[str] = None,
        assume_downloaded_files: bool = False,
        retain_downloaded_files: bool = False,
        print_downloading_lines: bool = False,
    ):
        """
        Handles file loading and path mapping for audio processing.

        Args:
            log: Logger instance.
            json_base_dir (str): Base directory (or URI prefix) containing the per-day JSON files.
            audio_base_dir (str, optional): Base directory for relative `path` attributes in JSON entries.
            audio_path_map_prefix (str, optional): Prefix mapping for resolving actual audio URIs.
                Example: `"s3://pacific-sound-256khz-2022~file:///PAM_Archive/2022"`.
            audio_path_prefix (str, optional): Ad hoc path prefix for sound file locations, e.g., `"/Volumes"`.
            segment_size_in_mins (int, optional): The size of each extracted audio segment in minutes. Defaults to `1`.
            s3_client (object, optional): S3 client for handling `s3://` URIs.
            gs_client (object, optional): Google Cloud Storage client for handling `gs://` URIs.
            download_dir (str, optional): Directory to save downloaded S3 files. Defaults to the current directory.
            assume_downloaded_files (bool, optional): If `True`, skips downloading files that already exist in `download_dir`.
            retain_downloaded_files (bool, optional): If `True`, does not remove downloaded files after use.
            print_downloading_lines (bool, optional): If `True`, prints `"downloading <uri>"` messages to the console.
        """
        self.log = log

        self.log.info(
            "Creating FileHelper:"
            + f"\n    json_base_dir:           {json_base_dir}"
            + (
                f"\n    audio_base_dir:          {audio_base_dir}"
                if audio_base_dir
                else ""
            )
            + (
                f"\n    audio_path_map_prefix:   '{audio_path_map_prefix}'"
                if audio_path_map_prefix
                else ""
            )
            + (
                f"\n    audio_path_prefix:       '{audio_path_prefix}'"
                if audio_path_prefix
                else ""
            )
            + f"\n    segment_size_in_mins:    {segment_size_in_mins}"
            + f"\n    s3_client:               {'(given)' if s3_client else 'None'}"
            + f"\n    gs_client:               {'(given)' if gs_client else 'None'}"
            + f"\n    download_dir:            {download_dir}"
            + f"\n    assume_downloaded_files: {assume_downloaded_files}"
            + f"\n    retain_downloaded_files: {retain_downloaded_files}"
            + f"\n    print_downloading_lines: {print_downloading_lines}"
            + "\n"
        )
        self.json_base_dir = json_base_dir
        self.audio_base_dir = audio_base_dir
        self.audio_path_map_prefix = audio_path_map_prefix
        self.audio_path_prefix = audio_path_prefix
        self.segment_size_in_mins = segment_size_in_mins
        self.s3_client = s3_client
        self.gs_client = gs_client
        self.download_dir: str = download_dir if download_dir else "."
        self.assume_downloaded_files = assume_downloaded_files
        self.retain_downloaded_files = retain_downloaded_files
        self.print_downloading_lines = print_downloading_lines

        # cache of SoundStatus per audio URI, pruned by age in _get_sound_status:
        self.sound_cache: Dict[str, SoundStatus] = {}

        # the following set by select_day:
        self.year: Optional[int] = None
        self.month: Optional[int] = None
        self.day: Optional[int] = None
        self.json_entries: Optional[List[JEntry]] = None

    def select_day(self, year: int, month: int, day: int) -> bool:
        """
        Selects the given day for subsequent processing of relevant audio segments.

        Args:
            year (int): The year.
            month (int): The month.
            day (int): The day.

        Returns:
            True only if selection was successful
        """

        self.log.info(f"Selecting day: {year:04}{month:02}{day:02}")

        json_uri = f"{self.json_base_dir}/{year:04}/{year:04}{month:02}{day:02}.json"
        json_contents = self._get_json(json_uri)
        if json_contents is None:
            self.log.error(f"{json_uri}: file not found\n")
            return False

        self.year = year
        self.month = month
        self.day = day
        self.json_entries = list(parse_json_contents(json_contents))
        return True

    def get_local_filename(self, uri: str) -> Optional[str]:
        """
        Returns the local filename for the given URI, which will be that of
        the downloaded file when the given uri is cloud based.

        Args:
            uri (str): The URI of the file.

        Returns:
            The local filename or None if error or if the scheme is not `s3` or `gs`.
        """
        parsed_uri = urlparse(uri)
        if parsed_uri.scheme in ("s3", "gs"):
            return _download(
                log=self.log,
                parsed_uri=parsed_uri,
                download_dir=self.download_dir,
                assume_downloaded_files=self.assume_downloaded_files,
                print_downloading_lines=self.print_downloading_lines,
                s3_client=self.s3_client,
                gs_client=self.gs_client,
            )

        return parsed_uri.path

    def day_completed(self):
        """
        ProcessHelper calls this to indicate that the day's processing is completed.
        Since a process is launched only for a day, we simply clear the cache.
        """
        # first, close all sound files still open:
        num_still_open = 0
        for c_uri, c_ss in list(self.sound_cache.items()):
            # due to some weird issues (when running under dask), let's be extra careful:
            # TODO clean up this!
            if (
                hasattr(c_ss, "sound_file")
                and hasattr(c_ss, "sound_file_open")
                and c_ss.sound_file_open
            ):
                c_ss.sound_file_open = False
                num_still_open += 1
                self.log.debug(f"Closing sound file for cached {c_uri=} {c_ss.age=}")
                c_ss.sound_file.close()
        self.log.debug(
            f"day_completed: closed {num_still_open} sound files that were still open."
        )

        # remove any downloaded files (cloud case):
        if not self.retain_downloaded_files:
            for c_ss in self.sound_cache.values():
                c_ss.remove_downloaded_file()

        self.sound_cache = {}

    def _get_json(self, uri: str) -> Optional[str]:
        """
        Returns the contents of the JSON file at the given URI, or None if it
        could not be obtained.
        """
        parsed_uri = urlparse(uri)
        if parsed_uri.scheme == "s3":
            return self._get_json_s3(parsed_uri)
        # NOTE(review): `gs://` JSON URIs are not dispatched to a download here,
        # although _download supports gs — confirm whether that is intended.
        #  simply assume local file:
        if os.name == "nt":
            # on Windows, use the raw uri: urlparse would interpret a drive
            # letter like "C:" as the URI scheme and mangle the path.
            return self._get_json_local(uri)
        else:
            return self._get_json_local(parsed_uri.path)

    def _get_json_s3(self, parsed_uri: ParseResult) -> Optional[str]:
        """
        Downloads the JSON file at the given parsed URI and returns its
        contents, or None if the download failed.
        """
        local_filename = _download(
            log=self.log,
            parsed_uri=parsed_uri,
            download_dir=self.download_dir,
            assume_downloaded_files=self.assume_downloaded_files,
            print_downloading_lines=self.print_downloading_lines,
            s3_client=self.s3_client,
            gs_client=self.gs_client,
        )
        if local_filename is None:
            return None
        return self._get_json_local(local_filename)

    def extract_audio_segment(
        self, at_hour: int, at_minute: int
    ) -> Optional[Tuple[AudioInfo, np.ndarray]]:
        """
        Extracts the audio segment at the given start time.
        For this it loads and aggregates the relevant audio segments.

        Args:
            at_hour (int): Hour of the start time of the segment to extract.
            at_minute (int): Minute of the start time of the segment to extract.

        Returns:
            A tuple (audio_info, audio_segment) or None
        """

        # select_day must have been called successfully before this:
        assert self.json_entries is not None
        assert self.year is not None
        assert self.month is not None
        assert self.day is not None

        intersections = get_intersecting_entries(
            self.log,
            self.json_entries,
            self.year,
            self.month,
            self.day,
            at_hour,
            at_minute,
            segment_size_in_mins=self.segment_size_in_mins,
        )

        # captured from the first contributing file; all subsequent files must agree:
        audio_info: Optional[AudioInfo] = None

        # accumulates the samples of all intersecting pieces, in order:
        aggregated_segment: Optional[np.ndarray] = None

        prefix = f"({at_hour:02}h:{at_minute:02}m)"
        for intersection in intersections:
            if intersection.duration_secs == 0:
                self.log.warning("No data from intersection")
                continue

            ss = self._get_sound_status(intersection.entry.uri)
            if ss.error is not None:
                return None

            self.log.debug(
                f"    {prefix} {intersection.duration_secs} secs from {ss.sound_filename}"
            )

            if audio_info is not None and not self._check_audio_info(
                audio_info, ss.audio_info
            ):
                return None  # error!

            audio_info = ss.audio_info

            start_sample = floor(intersection.start_secs * audio_info.samplerate)
            num_samples = ceil(intersection.duration_secs * audio_info.samplerate)

            try:
                new_pos = ss.sound_file.seek(start_sample)
                if new_pos != start_sample:
                    # no-data case, let's just read 0 samples to get an empty array:
                    audio_segment = ss.sound_file.read(0)
                else:
                    audio_segment = ss.sound_file.read(num_samples)
                    if len(audio_segment) < num_samples:
                        # partial-data case.
                        self.log.warning(
                            f"!!! partial data: {len(audio_segment)} < {num_samples}"
                        )

            except sf.LibsndfileError as e:
                self.log.error(f"{e}")
                return None

            if aggregated_segment is None:
                aggregated_segment = audio_segment
            else:
                aggregated_segment = np.concatenate((aggregated_segment, audio_segment))

        if aggregated_segment is not None:
            assert audio_info is not None
            return audio_info, aggregated_segment
        return None

    def _check_audio_info(self, ai1: AudioInfo, ai2: AudioInfo) -> bool:
        """
        Returns True only if the two AudioInfo objects agree on samplerate,
        channels, and subtype; mismatches are logged as errors.
        """
        if ai1.samplerate != ai2.samplerate:
            self.log.error(
                f"UNEXPECTED: sample rate mismatch: {ai1.samplerate} vs {ai2.samplerate}"
            )
            return False
        if ai1.channels != ai2.channels:
            self.log.error(
                f"UNEXPECTED: channel count mismatch: {ai1.channels} vs {ai2.channels}"
            )
            return False
        if ai1.subtype != ai2.subtype:
            self.log.error(
                f"UNEXPECTED: subtype mismatch: {ai1.subtype} vs {ai2.subtype}"
            )
            return False
        return True

    def _get_sound_status(self, uri: str) -> SoundStatus:
        """
        Returns a SoundStatus object for the given uri.
        Internally, the 'age' attribute helps to keep the relevant files open
        as long as recently used. Note that traversal of the files indicated in the
        JSON array happens in a monotonically increasing order in time, so we
        can increment the 'age' for all entries in the cache except for the uri
        just requested.

        Args:
            uri: The URI of the sound file.

        Returns:
            The SoundStatus object.
        """
        self.log.debug(f"_get_sound_status: {uri=}")
        ss = self.sound_cache.get(uri)
        if ss is None:
            # currently cached ones get a bit older:
            for c_ss in self.sound_cache.values():
                c_ss.age += 1

            self.log.debug(f"SoundStatus: creating for {uri=}")
            ss = SoundStatus(
                log=self.log,
                uri=uri,
                audio_base_dir=self.audio_base_dir,
                audio_path_map_prefix=self.audio_path_map_prefix,
                audio_path_prefix=self.audio_path_prefix,
                download_dir=self.download_dir,
                assume_downloaded_files=self.assume_downloaded_files,
                print_downloading_lines=self.print_downloading_lines,
                s3_client=self.s3_client,
                gs_client=self.gs_client,
            )
            self.sound_cache[uri] = ss
        else:
            self.log.debug(f"SoundStatus: already available for {uri=}")

        # close and remove files in the cache that are not fresh enough in terms
        # of not being recently used
        for c_uri, c_ss in list(self.sound_cache.items()):
            if uri != c_uri and c_ss.age > 2 and c_ss.sound_file_open:
                self.log.debug(
                    f"Closing sound file for cached uri={c_uri} age={c_ss.age}"
                )
                c_ss.sound_file.close()
                c_ss.sound_file_open = False
                if not self.retain_downloaded_files:
                    c_ss.remove_downloaded_file()

        def log_msg():
            # built lazily: only evaluated when debug logging is enabled
            c_sss = self.sound_cache.values()
            open_files = len([c_ss for c_ss in c_sss if c_ss.sound_file_open])
            ages = [c_ss.age for c_ss in c_sss]
            return f"{open_files=}  ages={brief_list(ages)}"

        self.log.opt(lazy=True).debug("_get_sound_status: {}", log_msg)

        return ss

    def _get_json_local(self, filename: str) -> Optional[str]:
        """
        Reads the given local JSON file, returning its contents as a string,
        or None upon any I/O error.
        """
        try:
            with open(filename, "r", encoding="UTF-8") as f:
                return f.read()
        except IOError as e:
            # report which file failed (was a literal "(unknown)" placeholder):
            self.log.error(f"Error reading {filename}: {e}")
            return None

__init__(log, json_base_dir, audio_base_dir=None, audio_path_map_prefix='', audio_path_prefix='', segment_size_in_mins=1, s3_client=None, gs_client=None, download_dir=None, assume_downloaded_files=False, retain_downloaded_files=False, print_downloading_lines=False)

Handles file loading and path mapping for audio processing.

Parameters:

Name Type Description Default
log

Logger instance.

required
audio_base_dir str

Base directory for relative path attributes in JSON entries.

None
audio_path_map_prefix str

Prefix mapping for resolving actual audio URIs. Example: "s3://pacific-sound-256khz-2022~file:///PAM_Archive/2022".

''
audio_path_prefix str

Ad hoc path prefix for sound file locations, e.g., "/Volumes".

''
segment_size_in_mins int

The size of each extracted audio segment in minutes. Defaults to 1.

1
s3_client object

S3 client for handling s3:// URIs.

None
gs_client object

Google Cloud Storage client for handling gs:// URIs.

None
download_dir str

Directory to save downloaded S3 files. Defaults to the current directory.

None
assume_downloaded_files bool

If True, skips downloading files that already exist in download_dir.

False
retain_downloaded_files bool

If True, does not remove downloaded files after use.

False
print_downloading_lines bool

If True, prints "downloading <uri>" messages to the console.

False
Source code in pbp/file_helper.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
def __init__(
    self,
    log,  # : loguru.Logger,
    json_base_dir: str,
    audio_base_dir: Optional[str] = None,
    audio_path_map_prefix: str = "",
    audio_path_prefix: str = "",
    segment_size_in_mins: int = 1,
    s3_client: Optional[BaseClient] = None,
    gs_client: Optional[GsClient] = None,
    download_dir: Optional[str] = None,
    assume_downloaded_files: bool = False,
    retain_downloaded_files: bool = False,
    print_downloading_lines: bool = False,
):
    """
    Sets up file loading and path mapping for audio processing.

    Args:
        log: Logger instance.
        json_base_dir (str): Base directory containing the per-day JSON files.
        audio_base_dir (str, optional): Base directory for relative `path` attributes in JSON entries.
        audio_path_map_prefix (str, optional): Prefix mapping used to resolve actual audio URIs,
            e.g. `"s3://pacific-sound-256khz-2022~file:///PAM_Archive/2022"`.
        audio_path_prefix (str, optional): Ad hoc path prefix for sound file locations, e.g. `"/Volumes"`.
        segment_size_in_mins (int, optional): Size in minutes of each extracted audio segment. Defaults to `1`.
        s3_client (object, optional): S3 client used for `s3://` URIs.
        gs_client (object, optional): Google Cloud Storage client used for `gs://` URIs.
        download_dir (str, optional): Where downloaded files are saved. Defaults to the current directory.
        assume_downloaded_files (bool, optional): If `True`, files already present in `download_dir`
            are not downloaded again.
        retain_downloaded_files (bool, optional): If `True`, downloaded files are kept after use.
        print_downloading_lines (bool, optional): If `True`, `"downloading <uri>"` lines are printed
            to the console.
    """
    self.log = log

    # assemble the creation report piecewise, omitting unset optional settings:
    report = [
        "Creating FileHelper:",
        f"\n    json_base_dir:           {json_base_dir}",
    ]
    if audio_base_dir:
        report.append(f"\n    audio_base_dir:          {audio_base_dir}")
    if audio_path_map_prefix:
        report.append(f"\n    audio_path_map_prefix:   '{audio_path_map_prefix}'")
    if audio_path_prefix:
        report.append(f"\n    audio_path_prefix:       '{audio_path_prefix}'")
    report.append(f"\n    segment_size_in_mins:    {segment_size_in_mins}")
    report.append(f"\n    s3_client:               {'(given)' if s3_client else 'None'}")
    report.append(f"\n    gs_client:               {'(given)' if gs_client else 'None'}")
    report.append(f"\n    download_dir:            {download_dir}")
    report.append(f"\n    assume_downloaded_files: {assume_downloaded_files}")
    report.append(f"\n    retain_downloaded_files: {retain_downloaded_files}")
    report.append(f"\n    print_downloading_lines: {print_downloading_lines}")
    report.append("\n")
    self.log.info("".join(report))

    self.json_base_dir = json_base_dir
    self.audio_base_dir = audio_base_dir
    self.audio_path_map_prefix = audio_path_map_prefix
    self.audio_path_prefix = audio_path_prefix
    self.segment_size_in_mins = segment_size_in_mins
    self.s3_client = s3_client
    self.gs_client = gs_client
    self.download_dir: str = download_dir if download_dir else "."
    self.assume_downloaded_files = assume_downloaded_files
    self.retain_downloaded_files = retain_downloaded_files
    self.print_downloading_lines = print_downloading_lines

    # cache of SoundStatus per audio URI:
    self.sound_cache: Dict[str, SoundStatus] = {}

    # the following set by select_day:
    self.year: Optional[int] = None
    self.month: Optional[int] = None
    self.day: Optional[int] = None
    self.json_entries: Optional[List[JEntry]] = None

select_day(year, month, day)

Selects the given day for subsequent processing of relevant audio segments.

Parameters:

Name Type Description Default
year int

The year.

required
month int

The month.

required
day int

The day.

required

Returns:

Type Description
bool

True only if selection was successful

Source code in pbp/file_helper.py
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def select_day(self, year: int, month: int, day: int) -> bool:
    """
    Selects the given day for subsequent processing of relevant audio segments.

    Args:
        year (int): The year.
        month (int): The month.
        day (int): The day.

    Returns:
        True only if selection was successful
    """

    date_str = f"{year:04}{month:02}{day:02}"
    self.log.info(f"Selecting day: {date_str}")

    # the day's JSON file lives under a per-year subdirectory:
    json_uri = f"{self.json_base_dir}/{year:04}/{date_str}.json"
    json_contents = self._get_json(json_uri)
    if json_contents is None:
        self.log.error(f"{json_uri}: file not found\n")
        return False

    self.year, self.month, self.day = year, month, day
    self.json_entries = list(parse_json_contents(json_contents))
    return True

get_local_filename(uri)

Returns the local filename for the given URI, which will be that of the downloaded file when the given uri is cloud based.

Parameters:

Name Type Description Default
uri str

The URI of the file.

required

Returns:

Type Description
Optional[str]

The local filename or None if error or if the scheme is not s3 or gs.

Source code in pbp/file_helper.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
def get_local_filename(self, uri: str) -> Optional[str]:
    """
    Resolves the given URI to a local filename; for cloud (`s3`/`gs`) URIs
    this is the name of the downloaded file.

    Args:
        uri (str): The URI of the file.

    Returns:
        The local filename, or None upon a download error.
    """
    parsed_uri = urlparse(uri)
    if parsed_uri.scheme not in ("s3", "gs"):
        # not cloud based: the path component is already the local filename
        return parsed_uri.path

    return _download(
        log=self.log,
        parsed_uri=parsed_uri,
        download_dir=self.download_dir,
        assume_downloaded_files=self.assume_downloaded_files,
        print_downloading_lines=self.print_downloading_lines,
        s3_client=self.s3_client,
        gs_client=self.gs_client,
    )

day_completed()

ProcessHelper calls this to indicate that the day's processing is completed. Since a process is launched only for a day, we simply clear the cache.

Source code in pbp/file_helper.py
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
def day_completed(self):
    """
    Called by ProcessHelper once the day's processing is done.
    A process is launched for a single day only, so we just tear down the cache.
    """
    # close any sound files that remain open:
    num_still_open = 0
    for c_uri, c_ss in list(self.sound_cache.items()):
        # due to some weird issues (when running under dask), let's be extra careful:
        # TODO clean up this!
        still_open = (
            hasattr(c_ss, "sound_file")
            and hasattr(c_ss, "sound_file_open")
            and c_ss.sound_file_open
        )
        if not still_open:
            continue
        c_ss.sound_file_open = False
        num_still_open += 1
        self.log.debug(f"Closing sound file for cached {c_uri=} {c_ss.age=}")
        c_ss.sound_file.close()
    self.log.debug(
        f"day_completed: closed {num_still_open} sound files that were still open."
    )

    # cloud case: drop the downloaded copies unless asked to retain them
    if not self.retain_downloaded_files:
        for c_ss in self.sound_cache.values():
            c_ss.remove_downloaded_file()

    self.sound_cache = {}

extract_audio_segment(at_hour, at_minute)

Extracts the audio segment at the given start time. For this it loads and aggregates the relevant audio segments.

Parameters:

Name Type Description Default
at_hour int

The hour when the audio segment was extracted.

required
at_minute int

The minute when the audio segment was extracted.

required

Returns:

Type Description
Optional[Tuple[AudioInfo, ndarray]]

A tuple (audio_info, audio_segment) or None

Source code in pbp/file_helper.py
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
def extract_audio_segment(
    self, at_hour: int, at_minute: int
) -> Optional[Tuple[AudioInfo, np.ndarray]]:
    """
    Extracts the audio segment at the given start time.
    For this it loads and aggregates the relevant audio segments.

    Args:
        at_hour (int): Hour of the start time of the segment to extract.
        at_minute (int): Minute of the start time of the segment to extract.

    Returns:
        A tuple (audio_info, audio_segment) or None
    """

    # select_day must have been called successfully before this:
    assert self.json_entries is not None
    assert self.year is not None
    assert self.month is not None
    assert self.day is not None

    # the JSON entries (i.e., sound files) overlapping the requested window:
    intersections = get_intersecting_entries(
        self.log,
        self.json_entries,
        self.year,
        self.month,
        self.day,
        at_hour,
        at_minute,
        segment_size_in_mins=self.segment_size_in_mins,
    )

    # captured from the first contributing file; all subsequent files must
    # agree with it (checked via _check_audio_info):
    audio_info: Optional[AudioInfo] = None

    # accumulates the samples of all intersecting pieces, in order:
    aggregated_segment: Optional[np.ndarray] = None

    prefix = f"({at_hour:02}h:{at_minute:02}m)"
    for intersection in intersections:
        if intersection.duration_secs == 0:
            self.log.warning("No data from intersection")
            continue

        ss = self._get_sound_status(intersection.entry.uri)
        if ss.error is not None:
            # the sound file could not be opened/resolved; abort the extraction
            return None

        self.log.debug(
            f"    {prefix} {intersection.duration_secs} secs from {ss.sound_filename}"
        )

        if audio_info is not None and not self._check_audio_info(
            audio_info, ss.audio_info
        ):
            return None  # error!

        audio_info = ss.audio_info

        # convert the intersection's time window to sample offsets:
        start_sample = floor(intersection.start_secs * audio_info.samplerate)
        num_samples = ceil(intersection.duration_secs * audio_info.samplerate)

        try:
            # seek returns the resulting position; a mismatch indicates the
            # requested offset is beyond the file's data:
            new_pos = ss.sound_file.seek(start_sample)
            if new_pos != start_sample:
                # no-data case, let's just read 0 samples to get an empty array:
                audio_segment = ss.sound_file.read(0)
            else:
                audio_segment = ss.sound_file.read(num_samples)
                if len(audio_segment) < num_samples:
                    # partial-data case.
                    self.log.warning(
                        f"!!! partial data: {len(audio_segment)} < {num_samples}"
                    )

        except sf.LibsndfileError as e:
            self.log.error(f"{e}")
            return None

        if aggregated_segment is None:
            aggregated_segment = audio_segment
        else:
            aggregated_segment = np.concatenate((aggregated_segment, audio_segment))

    if aggregated_segment is not None:
        assert audio_info is not None
        return audio_info, aggregated_segment
    # no intersecting data at all for the requested window:
    return None