Everett Kleven
09/23/2024, 11:20 PM

Kevin Wang
09/23/2024, 11:24 PM
pa_table = pa.Table.from_pylist([], schema=schema) and then you can do daft.from_arrow(pa_table)
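Spelled out as a runnable sketch of that suggestion (the id and value fields here are just placeholders for whatever schema you already have):

import pyarrow as pa

import daft

# Stand-in schema; substitute your own fields.
schema = pa.schema([("id", pa.string()), ("value", pa.int64())])

# An empty list plus an explicit schema yields a zero-row Arrow table...
pa_table = pa.Table.from_pylist([], schema=schema)

# ...which Daft wraps as an empty DataFrame with the same columns.
df = daft.from_arrow(pa_table)
df.show()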
Kevin Wang
09/23/2024, 11:24 PM

Everett Kleven
09/23/2024, 11:39 PM
from typing import ClassVar, Optional

import daft
import pyarrow as pa
from daft import DataFrame

base_schema = pa.schema(
    [
        pa.field(
            "id",
            pa.string(),
            nullable=False,
            metadata={"description": "Unique identifier for the record"},
        ),
        pa.field(
            "created_at",
            pa.timestamp("ns", tz="UTC"),
            nullable=False,
            metadata={"description": "Creation timestamp"},
        ),
        pa.field(
            "updated_at",
            pa.timestamp("ns", tz="UTC"),
            nullable=False,
            metadata={"description": "Last update timestamp"},
        ),
        pa.field(
            "inserted_at",
            pa.timestamp("ns", tz="UTC"),
            nullable=False,
            metadata={"description": "Insertion timestamp into the database"},
        ),
    ]
)


class BaseDF:
    schema: ClassVar[pa.Schema] = base_schema
    df: Optional[DataFrame] = daft.from_arrow(base_schema.empty_table())

    @classmethod
    def validate_schema(cls, df: DataFrame) -> DataFrame:
        # Compare against the class-level Arrow schema
        # (pa.Schema.equals ignores field metadata by default).
        if not df.schema().to_pyarrow_schema().equals(cls.schema):
            raise ValueError(f"DataFrame schema does not match the {cls.__name__} schema.")
        return df
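A quick usage sketch of the BaseDF pattern above; the row values are invented. One caveat: pa.Schema.equals also compares field nullability, so the strict check only passes if Daft's Arrow round trip preserves the nullable=False flags.

from datetime import datetime, timezone

now = datetime(2024, 9, 23, 23, 39, tzinfo=timezone.utc)
rows = [{"id": "rec-001", "created_at": now, "updated_at": now, "inserted_at": now}]

# Construct against the Arrow schema first, then lift into Daft and validate.
table = pa.Table.from_pylist(rows, schema=base_schema)
valid_df = BaseDF.validate_schema(daft.from_arrow(table))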
Everett Kleven
09/23/2024, 11:41 PM

Kevin Wang
09/23/2024, 11:44 PM

Everett Kleven
09/23/2024, 11:45 PM
self.schema: daft.Schema, but since I am integrating with LanceDB and Iceberg, I figured Arrow should be the source of truth.
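That design composes nicely with LanceDB: a minimal sketch, assuming a local database (the ./lance_db path and records table name are made up), where the same pa.Schema that seeds the Daft DataFrame also creates the LanceDB table.

import lancedb

db = lancedb.connect("./lance_db")  # hypothetical local database path

# create_table accepts a pyarrow schema, so the Arrow definition
# above doubles as the LanceDB table definition.
records = db.create_table("records", schema=base_schema)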
Everett Kleven
09/23/2024, 11:46 PM