Dataset processing and training utilities for machine learning projects.
pip install -r requirements.txt
scripts/
download/ # Dataset download scripts
extract/ # Data extraction scripts
analysis/ # Training analysis and evaluation
logs/ # Training logs and outputs
# Download FineWeb dataset
python scripts/download/download_fineweb.py --limit 1000 --output output.txt
# Download with wget scripts
bash scripts/download/wget_fineweb_1.sh
# Extract from parquet files
python scripts/extract/extract_parquet.py
# Extract FineWeb data
python scripts/extract/extract_fineweb.py
# Calculate training duration
python scripts/analysis/calculate_duration.py
# Evaluate training metrics
python scripts/analysis/evaluate.py --file logs/train_log_openweb.txt