Skip to content

Pandas

ts_bolt.datamodules.pandas¤

DataFrameDataset ¤

Bases: Dataset

A dataset from a pandas dataframe.

For a given pandas dataframe, this generates a PyTorch-compatible dataset by sliding a window along the time dimension.

ds = DataFrameDataset(
    dataframe=df, history_length=10, horizon=2
)

Parameters:

Name Type Description Default
dataframe DataFrame

input dataframe with a DatetimeIndex.

required
history_length int

length of input X in time dimension in the final Dataset class.

required
horizon int

number of steps to be forecasted.

required
gap int

number of time steps skipped between the end of the input history and the start of the prediction window.

0
Source code in ts_bolt/datamodules/pandas.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
class DataFrameDataset(Dataset):
    """A dataset from a pandas dataframe.

    For a given pandas dataframe, this generates a pytorch
    compatible dataset by sliding a window along the time dimension.

    ```python
    ds = DataFrameDataset(
        dataframe=df, history_length=10, horizon=2
    )
    ```

    Each item is a tuple ``(x, y)`` of numpy arrays: ``x`` holds
    ``history_length`` consecutive rows and ``y`` holds the ``horizon``
    rows that follow once ``gap`` rows have been skipped.

    :param dataframe: input dataframe with a DatetimeIndex.
    :param history_length: length of input X in time dimension
        in the final Dataset class.
    :param horizon: number of steps to be forecasted.
    :param gap: gap between input history and prediction
    """

    def __init__(
        self, dataframe: pd.DataFrame, history_length: int, horizon: int, gap: int = 0
    ):
        super().__init__()
        self.dataframe = dataframe
        self.history_length = history_length
        self.horizon = horizon
        self.gap = gap
        self.dataframe_rows = len(self.dataframe)
        # Number of complete (history, gap, horizon) windows that fit in the
        # dataframe. Clamped at zero so that ``len()`` stays valid when the
        # dataframe is shorter than a single window.
        self.length = max(
            0,
            self.dataframe_rows - self.history_length - self.horizon - self.gap + 1,
        )

    @property
    def horzion(self) -> int:
        """Backward-compatible alias for the historically misspelled attribute."""
        return self.horizon

    @horzion.setter
    def horzion(self, value: int) -> None:
        self.horizon = value

    def moving_slicing(self, idx: int, gap: int = 0) -> Tuple[np.ndarray, np.ndarray]:
        """Cut the window that starts at row position ``idx``.

        :param idx: positional row offset at which the history window starts.
        :param gap: number of rows skipped between history and target.
        :return: tuple ``(x, y)`` with the values of the history rows and
            of the target rows.
        """
        history_end = idx + self.history_length
        target_start = history_end + gap
        target_end = target_start + self.horizon
        # ``iloc`` makes the slicing explicitly positional, independent of
        # the kind of index the dataframe carries.
        x = self.dataframe.iloc[idx:history_end].values
        y = self.dataframe.iloc[target_start:target_end].values
        return x, y

    def _validate_dataframe(self) -> None:
        """Validate the input dataframe.

        - We require the dataframe index to be DatetimeIndex.
        - This dataset is null-averse: null values trigger a warning.
        - Dataframe index should be sorted; an unsorted index triggers
          a warning.

        This check is not run by ``__init__``; call it explicitly.

        :raises TypeError: if the dataframe index is not a DatetimeIndex.
        """

        if not isinstance(
            self.dataframe.index, pd.core.indexes.datetimes.DatetimeIndex
        ):
            raise TypeError(
                "Type of the dataframe index is not DatetimeIndex"
                f": {type(self.dataframe.index)}"
            )

        has_na = self.dataframe.isnull().values.any()

        if has_na:
            logger.warning("Dataframe has null")

        has_index_sorted = self.dataframe.index.equals(
            self.dataframe.index.sort_values()
        )

        if not has_index_sorted:
            logger.warning("Dataframe index is not sorted")

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return the window(s) selected by ``idx``.

        :param idx: an integer position (negative values count from the
            end) or a slice. Open slice bounds default to ``0`` and
            ``len(self)``.
        :return: a tuple ``(x, y)`` for an integer index, or a list of such
            tuples for a slice.
        :raises IndexError: if an integer index is outside the dataset, if
            a slice starts below zero, or if a slice stops beyond
            ``len(self)``.
        """
        if isinstance(idx, slice):
            step = 1 if idx.step is None else idx.step
            start = 0 if idx.start is None else idx.start
            stop = self.length if idx.stop is None else idx.stop
            # ``stop`` is exclusive, so ``stop == self.length`` is a valid
            # slice that reaches the last window.
            if start < 0 or stop > self.length:
                raise IndexError(f"Slice out of range: {idx}")
            return [self.moving_slicing(i, self.gap) for i in range(start, stop, step)]

        if idx < 0:
            # Sequence-style negative indexing: -1 is the last window.
            idx += self.length
        if not 0 <= idx < self.length:
            # IndexError also terminates iteration through ``__getitem__``.
            raise IndexError("End of dataset")
        return self.moving_slicing(idx, self.gap)

    def __len__(self) -> int:
        """Return the number of windows available in the dataset."""
        return self.length

_validate_dataframe() ¤

Validate the input dataframe.

  • We require the dataframe index to be DatetimeIndex.
  • This dataset is null-averse: null values in the dataframe trigger a warning.
  • Dataframe index should be sorted.
Source code in ts_bolt/datamodules/pandas.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def _validate_dataframe(self) -> None:
    """Check that the wrapped dataframe is usable as a time series.

    - The index has to be a DatetimeIndex, otherwise ``TypeError`` is raised.
    - Null values in the dataframe are reported with a warning.
    - An index that is not sorted is reported with a warning.
    """
    index = self.dataframe.index

    # A wrong index type is the only condition treated as a hard error.
    if not isinstance(index, pd.DatetimeIndex):
        raise TypeError(
            f"Type of the dataframe index is not DatetimeIndex: {type(index)}"
        )

    if self.dataframe.isna().values.any():
        logger.warning("Dataframe has null")

    # The index counts as sorted when sorting it leaves it unchanged.
    if not index.equals(index.sort_values()):
        logger.warning("Dataframe index is not sorted")