-
Notifications
You must be signed in to change notification settings - Fork 2k
Fix massive spill files for StringView/BinaryView columns #19444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7652570
e1a18e5
a5e0a12
c1a03a8
eba362f
7f08877
1ae3bb1
c256ab2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -24,7 +24,7 @@ use arrow::array::RecordBatch; | |||
| use datafusion_common::exec_datafusion_err; | ||||
| use datafusion_execution::disk_manager::RefCountedTempFile; | ||||
|
|
||||
| use super::{IPCStreamWriter, spill_manager::SpillManager}; | ||||
| use super::{IPCStreamWriter, gc_view_arrays, spill_manager::SpillManager}; | ||||
|
|
||||
| /// Represents an in-progress spill file used for writing `RecordBatch`es to disk, created by `SpillManager`. | ||||
| /// Caller is able to use this struct to incrementally append in-memory batches to | ||||
|
|
@@ -50,6 +50,7 @@ impl InProgressSpillFile { | |||
| } | ||||
|
|
||||
| /// Appends a `RecordBatch` to the spill file, initializing the writer if necessary. | ||||
| /// Performs garbage collection on StringView/BinaryView arrays to reduce spill file size. | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I recommend to add more comments to explain the rationale for views gc, perhaps just copy and paste from
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @EeshanBembi could you enhance the comment here? |
||||
| /// | ||||
| /// # Errors | ||||
| /// - Returns an error if the file is not active (has been finalized) | ||||
|
|
@@ -61,8 +62,11 @@ impl InProgressSpillFile { | |||
| "Append operation failed: No active in-progress file. The file may have already been finalized." | ||||
| )); | ||||
| } | ||||
|
|
||||
| let gc_batch = gc_view_arrays(batch)?; | ||||
|
|
||||
| if self.writer.is_none() { | ||||
| let schema = batch.schema(); | ||||
| let schema = gc_batch.schema(); | ||||
| if let Some(in_progress_file) = &mut self.in_progress_file { | ||||
| self.writer = Some(IPCStreamWriter::new( | ||||
| in_progress_file.path(), | ||||
|
|
@@ -83,7 +87,7 @@ impl InProgressSpillFile { | |||
| } | ||||
| } | ||||
| if let Some(writer) = &mut self.writer { | ||||
| let (spilled_rows, _) = writer.write(batch)?; | ||||
| let (spilled_rows, _) = writer.write(&gc_batch)?; | ||||
| if let Some(in_progress_file) = &mut self.in_progress_file { | ||||
| let pre_size = in_progress_file.current_disk_usage(); | ||||
| in_progress_file.update_disk_usage()?; | ||||
|
|
||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this unintentional change / git diff ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is intentional, arrow-data is added as a dev-dependency because the test helper calculate_string_view_waste_ratio uses arrow_data::MAX_INLINE_VIEW_LEN to determine whether a string value would be inlined in a StringView array.