Samuel Duan
07/26/2024, 7:34 PM
import daft
import boto3
# S3 URI of the Hudi table root that main() reads.
# Must be a bare URI: the angle brackets chat clients wrap around links are
# link markup, not part of the path, and would make the scheme unrecognizable.
bucket_path = "s3://my-bucket/sandbox/daft_hudi"
def main():
    """Read three columns of a Hudi table from S3 with Daft and materialize them.

    Credentials are resolved through the default boto3 session and handed to
    Daft explicitly via an ``IOConfig``. The table schema is printed, then the
    selection is converted to pandas, which forces the lazy plan to execute.

    Raises:
        RuntimeError: If boto3 cannot find any AWS credentials.
    """
    session = boto3.session.Session()
    creds = session.get_credentials()
    if creds is None:
        # get_credentials() returns None when no provider yields credentials;
        # fail with a clear message instead of an AttributeError further down.
        raise RuntimeError(
            "No AWS credentials found; configure a profile or environment variables."
        )

    # Take one consistent snapshot so key, secret and token all come from the
    # same refresh cycle when the credentials are refreshable.
    frozen = creds.get_frozen_credentials()

    io_config = daft.io.IOConfig(
        s3=daft.io.S3Config(
            # Naming differs between the libraries: Daft's `key_id` is the AWS
            # access key ID and Daft's `access_key` is the secret access key.
            key_id=frozen.access_key,
            access_key=frozen.secret_key,
            session_token=frozen.token,
            region_name="us-east-1",
        )
    )

    df = daft.read_hudi(bucket_path, io_config=io_config).select("year", "month", "day")
    print(df.schema())

    # Only the side effect matters here: to_pandas() collects the DataFrame,
    # which is where table metadata is actually read.
    df.to_pandas()
# Run the repro only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
jay
07/26/2024, 7:37 PMjay
07/26/2024, 7:49 PM
year/month/day?
Samuel Duan
07/26/2024, 8:11 PM
Traceback (most recent call last):
File "path\test_daft.py", line 22, in <module>
main()
File "path\test_daft.py", line 19, in main
df_pandas = df.to_pandas()
File "Path\myenv\lib\site-packages\daft\api_annotations.py", line 26, in _wrap
return timed_method(*args, **kwargs)
File "Path\myenv\lib\site-packages\daft\analytics.py", line 189, in tracked_method
result = method(*args, **kwargs)
File "Path\myenv\lib\site-packages\daft\dataframe\dataframe.py", line 2193, in to_pandas
self.collect()
File "Path\myenv\lib\site-packages\daft\api_annotations.py", line 26, in _wrap
return timed_method(*args, **kwargs)
File "Path\myenv\lib\site-packages\daft\analytics.py", line 189, in tracked_method
result = method(*args, **kwargs)
File "Path\myenv\lib\site-packages\daft\dataframe\dataframe.py", line 2060, in collect
self._materialize_results()
File "Path\myenv\lib\site-packages\daft\dataframe\dataframe.py", line 2042, in _materialize_results
self._result_cache = context.runner().run(self._builder)
File "Path\myenv\lib\site-packages\daft\runners\pyrunner.py", line 143, in run
results = list(self.run_iter(builder))
File "Path\myenv\lib\site-packages\daft\runners\pyrunner.py", line 204, in run_iter
plan_scheduler = builder.to_physical_plan_scheduler(daft_execution_config)
File "Path\myenv\lib\site-packages\daft\logical\builder.py", line 47, in to_physical_plan_scheduler
return PhysicalPlanScheduler.from_logical_plan_builder(
File "Path\myenv\lib\site-packages\daft\plan_scheduler\physical_plan_scheduler.py", line 35, in from_logical_plan_builder
scheduler = _PhysicalPlanScheduler.from_logical_plan_builder(builder._builder, daft_execution_config)
File "Path\myenv\lib\site-packages\daft\hudi\hudi_scan.py", line 54, in to_scan_tasks
hudi_table_metadata: HudiTableMetadata = self._table.latest_table_metadata()
File "Path\myenv\lib\site-packages\daft\hudi\pyhudi\table.py", line 186, in latest_table_metadata
pa.RecordBatch.from_arrays(min_value_arrays, schema=colstats_schema),
File "pyarrow\\table.pxi", line 3429, in pyarrow.lib.RecordBatch.from_arrays
File "pyarrow\\table.pxi", line 1557, in pyarrow.lib._sanitize_arrays
ValueError: Schema and number of arrays unequal
jay
07/26/2024, 8:12 PM
`pyhudi` implementation… We’ll need to have the Hudi folks help us with that.
Samuel Duan
07/26/2024, 8:12 PMjay
07/26/2024, 8:13 PMSamuel Duan
07/26/2024, 8:13 PMjay
07/26/2024, 8:13 PM
`pyhudi` implementation to a `hudi-rs` one, so this would be a good reason to do that.
Samuel Duan
07/27/2024, 1:16 AMSamuel Duan
07/27/2024, 1:16 AM