Bases: Dataset
A dataset from a pandas dataframe.
For a given pandas dataframe, this generates a PyTorch-compatible
dataset by sliding a window along the time dimension.
ds = DataFrameDataset(
dataframe=df, history_length=10, horizon=2
)
Parameters:
Name |
Type |
Description |
Default |
dataframe |
DataFrame
|
input dataframe with a DatetimeIndex.
|
required
|
history_length |
int
|
length of input X in time dimension in the final Dataset class.
|
required
|
horizon |
int
|
number of steps to be forecasted.
|
required
|
gap |
int
|
gap between input history and prediction
|
0
|
Source code in ts_bolt/datamodules/pandas.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
class DataFrameDataset(Dataset):
    """A dataset from a pandas dataframe.

    For a given pandas dataframe, this generates a PyTorch
    compatible dataset by sliding a window along the time dimension.
    Each sample is a pair ``(x, y)``: ``x`` holds ``history_length``
    consecutive rows and ``y`` holds the ``horizon`` rows that start
    ``gap`` rows after the end of ``x``.

    ```python
    ds = DataFrameDataset(
        dataframe=df, history_length=10, horizon=2
    )
    ```

    :param dataframe: input dataframe with a DatetimeIndex.
    :param history_length: length of input X in time dimension
        in the final Dataset class.
    :param horizon: number of steps to be forecasted.
    :param gap: gap between input history and prediction
    """

    def __init__(
        self, dataframe: pd.DataFrame, history_length: int, horizon: int, gap: int = 0
    ):
        super().__init__()
        self.dataframe = dataframe
        self.history_length = history_length
        self.horizon = horizon
        self.gap = gap
        self.dataframe_rows = len(self.dataframe)
        # Clamp at zero: a dataframe shorter than one full window would
        # otherwise give a negative length, and ``len()`` raises
        # ValueError when ``__len__`` returns a negative number.
        self.length = max(
            self.dataframe_rows - self.history_length - self.horizon - self.gap + 1,
            0,
        )
        # NOTE(review): ``_validate_dataframe()`` is not invoked here, so
        # the checks only run when a caller triggers them explicitly.
        # Confirm whether that is intended before wiring it in, since it
        # would start raising TypeError for non-datetime indexes.

    @property
    def horzion(self) -> int:
        """Misspelled alias of ``horizon``, kept for backward compatibility."""
        return self.horizon

    @horzion.setter
    def horzion(self, value: int) -> None:
        self.horizon = value

    def moving_slicing(self, idx: int, gap: int = 0) -> Tuple[np.ndarray, np.ndarray]:
        """Cut one (history, target) window pair out of the dataframe.

        :param idx: positional row at which the history window starts.
        :param gap: number of rows skipped between the end of the
            history window and the start of the target window.
        :return: tuple ``(x, y)`` of the history rows and the target
            rows, both as numpy arrays.
        """
        history_end = idx + self.history_length
        target_start = history_end + gap
        target_end = target_start + self.horizon
        # ``iloc`` makes the positional (row-number) slicing explicit.
        x = self.dataframe.iloc[idx:history_end].to_numpy()
        y = self.dataframe.iloc[target_start:target_end].to_numpy()
        return x, y

    def _validate_dataframe(self) -> None:
        """Validate the input dataframe.

        - We require the dataframe index to be DatetimeIndex.
        - Null values are unwanted; a warning is logged if any exist.
        - Dataframe index should be sorted; a warning is logged if not.

        :raises TypeError: if the dataframe index is not a DatetimeIndex.
        """
        if not isinstance(
            self.dataframe.index, pd.core.indexes.datetimes.DatetimeIndex
        ):
            raise TypeError(
                "Type of the dataframe index is not DatetimeIndex"
                f": {type(self.dataframe.index)}"
            )

        has_na = self.dataframe.isnull().values.any()
        if has_na:
            logger.warning("Dataframe has null")

        has_index_sorted = self.dataframe.index.equals(
            self.dataframe.index.sort_values()
        )
        if not has_index_sorted:
            logger.warning("Dataframe index is not sorted")

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return the sample at ``idx``, or a list of samples for a slice.

        An integer returns one ``(x, y)`` tuple. A slice returns a list
        of such tuples; omitted slice bounds default to the whole
        dataset in the direction of ``step``. Explicit slice bounds are
        taken literally as sample positions, so negative values are
        not counted from the end.

        :param idx: sample position, or a slice of sample positions.
        :raises IndexError: if ``idx`` (or any position visited by the
            slice) lies outside ``0 <= position < len(self)``.
        """
        if isinstance(idx, slice):
            step = 1 if idx.step is None else idx.step
            forward = step > 0
            start = idx.start
            if start is None:
                start = 0 if forward else self.length - 1
            stop = idx.stop
            if stop is None:
                stop = self.length if forward else -1
            # ``stop`` is exclusive, so validate the positions that are
            # actually visited instead of the raw bounds.
            positions = range(start, stop, step)
            if positions:
                lowest = min(positions[0], positions[-1])
                highest = max(positions[0], positions[-1])
                if lowest < 0 or highest >= self.length:
                    raise IndexError(f"Slice out of range: {idx}")
            return [self.moving_slicing(i, self.gap) for i in positions]

        # A negative position would slice a misaligned (mostly empty)
        # history window, so reject it rather than return bad data.
        if idx < 0:
            raise IndexError(f"Negative index is not supported: {idx}")
        if idx >= self.length:
            raise IndexError("End of dataset")
        return self.moving_slicing(idx, self.gap)

    def __len__(self) -> int:
        """Return the number of (history, target) samples available."""
        return self.length
|
_validate_dataframe()
Validate the input dataframe.
- We require the dataframe index to be DatetimeIndex.
- Null values are unwanted: a warning is logged if the dataframe contains any.
- Dataframe index should be sorted.
Source code in ts_bolt/datamodules/pandas.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def _validate_dataframe(self) -> None:
    """Check the wrapped dataframe against this dataset's expectations.

    - The index has to be a DatetimeIndex; anything else raises.
    - Null values are unwanted; their presence is logged as a warning.
    - The index is expected to be sorted; if not, a warning is logged.

    :raises TypeError: if the dataframe index is not a DatetimeIndex.
    """
    index = self.dataframe.index
    if not isinstance(index, pd.DatetimeIndex):
        raise TypeError(
            "Type of the dataframe index is not DatetimeIndex" f": {type(index)}"
        )

    if self.dataframe.isna().values.any():
        logger.warning("Dataframe has null")

    # Sorted means the index already equals its own sorted copy.
    if not index.equals(index.sort_values()):
        logger.warning("Dataframe index is not sorted")
|