Skip to content

Commit da2658d

Browse files
committed
feat(release): Prepare release for 0.4.0
1 parent 690decf commit da2658d

6 files changed

Lines changed: 234 additions & 8 deletions

File tree

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.3.0
1+
0.4.0

projects/holobrain/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
data/
2+
workspace/

robo_orchard_lab/dataset/datatypes/camera.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
from typing import Any, Literal
2020

2121
import datasets as hg_datasets
22+
import pyarrow as pa
23+
from datasets.table import array_cast
2224
from robo_orchard_core.datatypes.camera_data import (
2325
BatchCameraData,
2426
BatchCameraDataEncoded,
@@ -136,10 +138,50 @@ def __post_init__(self):
136138
"intrinsic_matrices": TypedTensorFeature(
137139
dtype=self.dtype, as_torch_tensor=True
138140
),
141+
"transform_matrices": TypedTensorFeature(
142+
dtype=self.dtype, as_torch_tensor=True
143+
),
139144
"distortion": DistortionFeature(dtype=self.dtype),
140145
"pose": BatchFrameTransformFeature(dtype=self.dtype),
141146
}
142147

148+
def cast_storage(self, storage: pa.StructArray) -> pa.StructArray:
    """Cast the storage array to the arrow type of this feature.

    Arrow tables written with an older schema may lack struct fields
    that the current feature declares. When ``storage`` is a struct
    array whose type differs from ``self.pa_type``, it is rebuilt
    field by field in the order of ``self.pa_type``: fields present in
    ``storage`` are reused, missing fields are filled with nulls, and
    fields that exist only in ``storage`` are left out. The result is
    then handed to ``array_cast``.

    Note:
        This method only handles missing fields. If the type of a
        field has changed, it is not handled here!

    Args:
        storage (pa.StructArray): The array to cast.

    Returns:
        pa.StructArray: ``storage`` itself when its type already equals
            ``self.pa_type``, otherwise the result of ``array_cast``.
    """
    feature_type: pa.StructType = self.pa_type
    storage_type: pa.StructType = storage.type

    # Only struct storage is rebuilt here; every other case is left to
    # the default ``array_cast`` implementation below.
    if pa.types.is_struct(storage_type):
        if storage_type == feature_type:
            return storage

        existing_fields = set(storage_type.names)
        num_rows = len(storage)

        # ``pa.nulls`` creates the all-null column directly instead of
        # converting a temporary Python list of ``None``.
        arrays = [
            storage.field(field.name)
            if field.name in existing_fields
            else pa.nulls(num_rows, type=field.type)
            for field in feature_type
        ]
        # Keep the struct-level validity: rows that were null in the
        # old storage stay null in the rebuilt one.
        storage = pa.StructArray.from_arrays(
            arrays, names=feature_type.names, mask=storage.is_null()
        )

    return array_cast(storage, feature_type)
184+
143185

144186
@classmethod
145187
def _camera_data_encoded_dataset_feature(
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# Project RoboOrchard
2+
#
3+
# Copyright (c) 2024-2026 Horizon Robotics. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14+
# implied. See the License for the specific language governing
15+
# permissions and limitations under the License.
16+
import fsspec
17+
from datasets.arrow_dataset import (
18+
Dataset,
19+
DatasetInfo,
20+
InMemoryTable,
21+
MemoryMappedTable,
22+
Optional,
23+
Path,
24+
PathLike,
25+
Split,
26+
concat_tables,
27+
estimate_dataset_size,
28+
hf_tqdm,
29+
is_remote_filesystem,
30+
is_small_dataset,
31+
json,
32+
thread_map,
33+
)
34+
from fsspec import url_to_fs
35+
36+
__all__ = ["load_from_disk"]
37+
38+
39+
def load_from_disk(
    dataset_path: PathLike,
    keep_in_memory: Optional[bool] = None,
    storage_options: Optional[dict] = None,
) -> Dataset:
    """Load a `Dataset` from disk and align it with its stored features.

    This is a wrapper around the logic of `datasets.load_from_disk`. It
    fixes loading of a `Dataset` whose `info.features` does not match the
    arrow table schema exactly: when the two schemas differ, the loaded
    table is cast to the schema derived from the stored features before
    the `Dataset` is constructed.

    Args:
        dataset_path (PathLike): Path or URL of the dataset directory. It
            is resolved with `fsspec.url_to_fs`; the content of a remote
            filesystem is downloaded to a local temporary directory first.
        keep_in_memory (Optional[bool], optional): Whether to load the
            arrow files into memory instead of memory-mapping them. If
            None, the choice is made by `is_small_dataset` from the
            estimated dataset size. Default is None.
        storage_options (Optional[dict], optional): Keyword arguments
            forwarded to `fsspec.url_to_fs`. Default is None.

    Returns:
        Dataset: The loaded dataset, with the format recorded in the
            dataset state applied.

    Raises:
        FileNotFoundError: If the dataset info file or the dataset state
            file is missing under `dataset_path`.
    """
    import posixpath

    import datasets.config as config

    fs: fsspec.AbstractFileSystem
    fs, dataset_path = url_to_fs(dataset_path, **(storage_options or {}))

    dest_dataset_path = dataset_path
    dataset_dict_json_path = posixpath.join(
        dest_dataset_path,  # type: ignore
        config.DATASETDICT_JSON_FILENAME,  # type: ignore
    )
    dataset_state_json_path = posixpath.join(
        dest_dataset_path,  # type: ignore
        config.DATASET_STATE_JSON_FILENAME,  # type: ignore
    )
    dataset_info_path = posixpath.join(
        dest_dataset_path,  # type: ignore
        config.DATASET_INFO_FILENAME,  # type: ignore
    )

    dataset_dict_is_file = fs.isfile(dataset_dict_json_path)
    missing_paths = [
        path
        for path in (dataset_info_path, dataset_state_json_path)
        if not fs.isfile(path)
    ]
    if missing_paths:
        # The exact wording of every message is kept as it was; note that
        # the single-file message only ends with "found." when the path
        # turns out to be a `DatasetDict`.
        if len(missing_paths) == 2:
            problem = (
                f"No such files: '{dataset_info_path}', "
                f"nor '{dataset_state_json_path}' found."
            )
        elif dataset_dict_is_file:
            problem = f"No such file: '{missing_paths[0]}' found."
        else:
            problem = f"No such file: '{missing_paths[0]}'."

        if dataset_dict_is_file:
            hint = (
                "Expected to load a `Dataset` object, but got a "
                "`DatasetDict`. Please use either "
                "`datasets.load_from_disk` or "
                "`DatasetDict.load_from_disk` instead."
            )
        else:
            hint = (
                "Expected to load a `Dataset` object but provided path "
                "is not a `Dataset`."
            )
        raise FileNotFoundError(f"{problem} {hint}")

    # Copy the files to the local filesystem if the source is remote, and
    # point the dataset paths at the temporary directory that holds the
    # local copies.
    if is_remote_filesystem(fs):
        src_dataset_path = dest_dataset_path
        dest_dataset_path = Dataset._build_local_temp_path(src_dataset_path)  # type: ignore
        fs.download(
            src_dataset_path, dest_dataset_path.as_posix(), recursive=True
        )
        dataset_state_json_path = posixpath.join(
            dest_dataset_path, config.DATASET_STATE_JSON_FILENAME
        )
        dataset_info_path = posixpath.join(
            dest_dataset_path, config.DATASET_INFO_FILENAME
        )

    with open(dataset_state_json_path, encoding="utf-8") as state_file:
        state = json.load(state_file)
    with open(dataset_info_path, encoding="utf-8") as dataset_info_file:
        dataset_info = DatasetInfo.from_dict(json.load(dataset_info_file))

    data_filenames = [
        data_file["filename"] for data_file in state["_data_files"]
    ]

    dataset_size = estimate_dataset_size(
        Path(dest_dataset_path, filename)  # type: ignore
        for filename in data_filenames
    )
    if keep_in_memory is None:
        keep_in_memory = is_small_dataset(dataset_size)
    table_cls = InMemoryTable if keep_in_memory else MemoryMappedTable

    arrow_table = concat_tables(
        thread_map(
            table_cls.from_file,
            [
                posixpath.join(dest_dataset_path, filename)
                for filename in data_filenames
            ],
            tqdm_class=hf_tqdm,
            desc="Loading dataset from disk",
            # set `disable=None` rather than `disable=False` by default
            # to disable progress bar when no TTY attached
            disable=len(data_filenames) <= 16 or None,
        )
    )

    split = state["_split"]
    split = Split(split) if split is not None else split

    # Align the table with the stored features. The features may be
    # absent from the dataset info, in which case there is nothing to
    # cast to and the table is passed on unchanged.
    features = dataset_info.features
    if features is not None:
        expected_schema = features.arrow_schema
        if arrow_table.schema != expected_schema:
            arrow_table = arrow_table.cast(expected_schema)

    dataset = Dataset(
        arrow_table=arrow_table,
        info=dataset_info,
        split=split,
        fingerprint=state["_fingerprint"],
    )

    # Named `format_spec` so the builtin `format` is not shadowed.
    format_spec = {
        "type": state["_format_type"],
        "format_kwargs": state["_format_kwargs"],
        "columns": state["_format_columns"],
        "output_all_columns": state["_output_all_columns"],
    }
    return dataset.with_format(**format_spec)

robo_orchard_lab/dataset/robot/dataset.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import json
1919
import os
2020
import shutil
21+
import warnings
2122
from contextlib import contextmanager
2223
from dataclasses import asdict
2324
from typing import (
@@ -139,9 +140,24 @@ def __init__(
139140
meta_index2meta: bool = False,
140141
):
141142
dataset_path = os.path.expanduser(dataset_path)
142-
self.frame_dataset = HFDataset.load_from_disk(
143-
dataset_path, storage_options=storage_options
144-
)
143+
144+
try:
145+
self.frame_dataset = HFDataset.load_from_disk(
146+
dataset_path, storage_options=storage_options
147+
)
148+
except ValueError as e: # noqa
149+
from robo_orchard_lab.dataset.robot._hf_dataset import (
150+
load_from_disk,
151+
)
152+
153+
warnings.warn(
154+
"Failed to load dataset using `datasets.load_from_disk`. "
155+
"Falling back to use wrapped version. "
156+
)
157+
self.frame_dataset = load_from_disk(
158+
dataset_path, storage_options=storage_options
159+
)
160+
145161
self.index_dataset = self.frame_dataset.select_columns(
146162
column_names=list(PreservedIndexColumnsKeys)
147163
)

setup.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
LICENSE_HEADER = """# Project RoboOrchard
2828
#
29-
# Copyright (c) 2024-2025 Horizon Robotics. All Rights Reserved.
29+
# Copyright (c) 2024-2026 Horizon Robotics. All Rights Reserved.
3030
#
3131
# Licensed under the Apache License, Version 2.0 (the "License");
3232
# you may not use this file except in compliance with the License.
@@ -231,8 +231,8 @@ def replace_local_dependency(dep: str) -> str:
231231
# specific version.
232232
# Do not delete the line below, just comment it out and add the
233233
# new line.
234-
# "robo_orchard_core==0.2.0",
235-
"robo_orchard_core@git+https://github.com/HorizonRobotics/robo_orchard_core.git@094b4c60430cc2d34fa17d03fb3bcd174bfdc67e",
234+
"robo_orchard_core==0.4.0",
235+
# "robo_orchard_core@git+https://github.com/HorizonRobotics/robo_orchard_core.git@ff16649dc899b99bf1d24966d0a00f90a849fdce",
236236
]
237237
# optional dependencies
238238
extras_require = {
@@ -270,7 +270,7 @@ def replace_local_dependency(dep: str) -> str:
270270
"mcap>=1.2.2",
271271
"foxglove-schemas-protobuf>=0.3.0",
272272
"opencv-python",
273-
"robo_orchard_schemas==0.1.1",
273+
"robo_orchard_schemas==0.2.0",
274274
],
275275
"aux_think": [
276276
"transformers",

0 commit comments

Comments
 (0)