Working with HDF5 Files
How to read an HDF5 file with default settings
from r2x_core import DataFile
from pathlib import Path
# Reads the first dataset automatically
datafile = DataFile(
name="simple_data",
fpath=Path("data.h5"),
file_type="H5Format"
)
How to read tabular HDF5 data with column names
# For files with separate data and column name datasets
datafile = DataFile(
name="tabular_data",
fpath=Path("tabular.h5"),
file_type="H5Format",
reader_kwargs={
"data_key": "values", # 2D data array
"columns_key": "col_names" # Column name strings
}
)
How to read HDF5 time series data
# For files with datetime indices
datafile = DataFile(
name="timeseries",
fpath=Path("timeseries.h5"),
file_type="H5Format",
reader_kwargs={
"data_key": "data",
"columns_key": "columns",
"datetime_key": "timestamps",
"datetime_column_name": "timestamp"
}
)
How to read HDF5 files with metadata fields
# Include additional datasets as columns
datafile = DataFile(
name="complex_data",
fpath=Path("complex.h5"),
file_type="H5Format",
reader_kwargs={
"data_key": "measurements",
"columns_key": "sensors",
"datetime_key": "time",
"additional_keys": ["location", "depth", "quality_flag"]
}
)
How to configure HDF5 reading in JSON
{
"name": "load_data",
"fpath": "data/load.h5",
"file_type": "H5Format",
"reader_kwargs": {
"data_key": "data",
"columns_key": "columns",
"datetime_key": "index_datetime",
"additional_keys": ["index_year"]
}
}
How to create a DataStore with multiple HDF5 files
{
"name": "my_datastore",
"datafiles": [
{
"name": "scientific_data",
"fpath": "experiments/results.h5",
"file_type": "H5Format",
"reader_kwargs": {
"data_key": "measurements",
"columns_key": "sensors",
"datetime_key": "timestamps",
"additional_keys": ["experiment_id", "lab_location"]
}
},
{
"name": "simple_data",
"fpath": "data/simple.h5",
"file_type": "H5Format"
}
]
}
How to process HDF5 data with datetime filtering
from r2x_core import DataFile, DataStore
from pathlib import Path
import polars as pl
# Define file structure
datafile = DataFile(
name="load_data",
fpath=Path("load_data.h5"),
file_type="H5Format",
reader_kwargs={
"data_key": "data",
"columns_key": "columns",
"datetime_key": "index_datetime",
"additional_keys": ["index_year"]
}
)
# Read and filter
store = DataStore(name="loads", datafiles=[datafile])
df_lazy = store.read_file("load_data")
# Filter by year
df = df_lazy.filter(
pl.col("datetime").dt.year() == 2007
).collect()