Unit tests for shuffler #22
Conversation
|
|
||
|
|
||
| class DataLoaderShuffleTest(unittest.TestCase): | ||
| """This test suite validates core RayDMatrix functionality.""" |
There was a problem hiding this comment.
| """This test suite validates core RayDMatrix functionality.""" | |
| """This test suite validates core shuffle functionality.""" |
| def tearDownClass(cls): | ||
| ray.shutdown() | ||
|
|
||
| def testShuffleMap(self): |
There was a problem hiding this comment.
Nit: Test method names should be snake_case.
| def testShuffleMap(self): | |
| def test_shuffle_map(self): |
| assert len(set(all_keys)) == len(all_keys), \ | ||
| "Keys in full dataset are not distinct." | ||
|
|
||
| def testShuffleReduce(self): |
There was a problem hiding this comment.
Nit: Test method names should be snake_case.
| def testShuffleReduce(self): | |
| def test_shuffle_reduce(self): |
| assert set(unshuffled) == set(shuffled), \ | ||
| "Key mismatch between unshuffled and shuffled parts" | ||
|
|
||
| def testShuffleEndToEnd(self): |
There was a problem hiding this comment.
Nit: Test method names should be snake_case.
| def testShuffleEndToEnd(self): | |
| def test_shuffle_end_to_end(self): |
| # 3sd = 99.7% chance of passing | ||
| assert mean - 3 * sd < len(part_keys) < mean + 3 * sd, \ | ||
| f"Not enough rows in partition {i}" |
There was a problem hiding this comment.
Nice! How should we interpret the outliers when this assertion eventually fails?
| assert len(all_keys) == self.num_rows, "Not all rows were returned." | ||
|
|
||
| assert len(set(all_keys)) == len(all_keys), \ | ||
| "Keys in full dataset are not distinct." |
There was a problem hiding this comment.
This can wait, but we may want to confirm that none of the actual data was unintentionally mutated, e.g. due to type coercion. That would probably require a slight refactor of (or utility added to) the data generation code.
|
|
||
| shuffled = ray.get( | ||
| shuffle_reduce.remote( | ||
| 0, |
There was a problem hiding this comment.
It shouldn't matter much (I actually don't think it's even used in the reducer anymore), but maybe we should set the reducer_index here.
| 0, | |
| i, |
| for tid, epoch_batches in consumer.rank_epoch_batches.items(): | ||
| for i in range(len(epoch_batches) - 1): | ||
| assert len(epoch_batches[i]) == len( | ||
| epoch_batches[+1]) == num_epochs, \ |
There was a problem hiding this comment.
| epoch_batches[+1]) == num_epochs, \ | |
| epoch_batches[1]) == num_epochs, \ |
| "Keys in dataset are not distinct." | ||
|
|
||
| assert set1 == set2, \ | ||
| "Shuffled key sets are not equal." |
No description provided.