diff --git a/datafaker/main.py b/datafaker/main.py index baf2d49..ae89779 100644 --- a/datafaker/main.py +++ b/datafaker/main.py @@ -143,108 +143,169 @@ def main( conf_logger(verbose) -@app.command() -def create_data( - orm_file: Path = Option( - ORM_FILENAME, - help="The name of the ORM yaml file", - dir_okay=False, - ), - config_file: Optional[Path] = Option( - CONFIG_FILENAME, - help="The configuration file", +@app.command(rich_help_panel="1. Initialize and Check") +def make_tables( + orm_file: Path = Option(ORM_FILENAME, help="Path to write the ORM yaml file to"), + force: bool = Option( + False, "--force", "-f", help="Overwrite any existing orm yaml file." ), - stats_file: Optional[Path] = Option( + parquet_dir: Optional[Path] = Option( None, help=( - "Statistics file (output of make-stats); default is src-stats.yaml if the " - "config file references SRC_STATS, or None otherwise." + "Directory of Parquet files to consider part of the database." + " This can be useful when using DuckDB." + " Make sure you check the output!" ), - show_default=False, - dir_okay=False, + file_okay=False, + dir_okay=True, ), - num_passes: int = Option(1, help="Number of passes (rows or stories) to make"), ) -> None: - """Populate the schema in the target directory with synthetic data. + """Make a YAML file representing the tables in the schema. - This CLI command generates synthetic data for - Python table structures, and inserts these rows - into a destination schema. + Example: + $ datafaker make_tables + """ + logger.debug("Creating %s.", orm_file) - Also takes as input object relational model as represented - by file containing Python classes and its attributes. + orm_file_path = Path(orm_file) + if not force: + _check_file_non_existence(orm_file_path) - Takes as input datafaker output as represented by Python - classes, its attributes and methods for generating values - for those attributes. + content = make_tables_file( + get_source_dsn(), + get_source_schema(), + parquet_dir, + ) + orm_file_path.write_text(content, encoding="utf-8") + logger.debug("%s created.", orm_file) - Final input is the number of rows required. - Example: - $ datafaker create-data - """ - logger.debug("Creating data.") - config = read_config_file(config_file) if config_file is not None else {} - if stats_file is None and generators_require_stats(config): - stats_file = Path(STATS_FILENAME) - orm_metadata = load_metadata_for_output(orm_file, config) - try: - row_counts = create_db_data( - sorted_non_vocabulary_tables(orm_metadata, config), - config, - stats_file, - num_passes, - orm_metadata, - ) - logger.debug( - "Data created in %s %s.", - num_passes, - "pass" if num_passes == 1 else "passes", +@app.command(rich_help_panel="2. Configure Export") +def configure_tables( + config_file: Path = Option( + CONFIG_FILENAME, + help="Path to write the configuration file to", + dir_okay=False, + ), + orm_file: Path = Option( + ORM_FILENAME, + help="The name of the ORM yaml file", + dir_okay=False, + ), +) -> None: + """Interactively set tables to ignored, vocabulary or primary private.""" + logger.debug("Configuring tables in %s.", config_file) + config = {} + if config_file.exists(): + config = yaml.load( + config_file.read_text(encoding="UTF-8"), Loader=yaml.SafeLoader ) - for table_name, row_count in row_counts.items(): - logger.debug( - "%s: %s %s created.", - table_name, - row_count, - "row" if row_count == 1 else "rows", - ) + # we don't pass config here so that no tables are ignored + meta_dict = load_metadata_config(orm_file) + metadata = dict_to_metadata(meta_dict, None) + config_updated = update_config_tables( + get_source_dsn(), + get_source_schema(), + metadata, + config, + Path(meta_dict["parquet-dir"]) if "parquet-dir" in meta_dict else None, + ) + if config_updated is None: + logger.debug("Cancelled") return - except RuntimeError as e: - logger.error(e.args[0]) - except SettingsError as e: - logger.error(str(e)) - raise Exit(1) + content = yaml.dump(config_updated) + config_file.write_text(content, encoding="utf-8") + logger.debug("Tables configured in %s.", config_file) -@app.command() -def create_vocab( +@app.command(rich_help_panel="2. Configure Export") +def configure_generators( + config_file: Path = Option( + CONFIG_FILENAME, + help="Path of the configuration file to alter", + dir_okay=False, + ), orm_file: Path = Option( ORM_FILENAME, help="The name of the ORM yaml file", dir_okay=False, ), + spec: Path = Option( + None, + help=( + "CSV file (headerless) with fields table-name," + " column-name, generator-name to set non-interactively" + ), + ), +) -> None: + """Interactively set generators for column data.""" + logger.debug("Configuring generators in %s.", config_file) + config = {} + if config_file.exists(): + config = yaml.load( + config_file.read_text(encoding="UTF-8"), Loader=yaml.SafeLoader + ) + meta_dict = load_metadata_config(orm_file) + metadata = dict_to_metadata(meta_dict, None) + config_updated = update_config_generators( + DbCmd.Settings( + get_source_dsn(), + get_source_schema(), + config, + metadata, + meta_dict.get("parquet-dir", None), + ), + spec_path=spec, + ) + if config_updated is None: + logger.debug("Cancelled") + return + content = yaml.dump(config_updated) + config_file.write_text(content, encoding="utf-8") + logger.debug("Generators configured in %s.", config_file) + + +@app.command(rich_help_panel="2. Configure Export") +def configure_missing( config_file: Path = Option( CONFIG_FILENAME, - help="The configuration file", + help="Path to write the configuration file to", + dir_okay=False, + ), + orm_file: Path = Option( + ORM_FILENAME, + help="The name of the ORM yaml file", dir_okay=False, ), ) -> None: - """Import vocabulary data into the target database. - - Example: - $ datafaker create-vocab - """ - logger.debug("Loading vocab.") - config = read_config_file(config_file) if config_file is not None else {} + """Interactively set the missingness of the generated data.""" + logger.debug("Configuring missingness in %s.", config_file) + config: dict[str, Any] = {} + if config_file.exists(): + config_any = yaml.load( + config_file.read_text(encoding="UTF-8"), Loader=yaml.SafeLoader + ) + if isinstance(config_any, dict): + config = config_any meta_dict = load_metadata_config(orm_file, config) - orm_metadata = dict_to_metadata(meta_dict, config) - vocabs_loaded = create_db_vocab(orm_metadata, meta_dict, config) - num_vocabs = len(vocabs_loaded) - logger.debug("%s %s loaded.", num_vocabs, "table" if num_vocabs == 1 else "tables") + metadata = dict_to_metadata(meta_dict, None) + config_updated = update_missingness( + get_source_dsn(), + get_source_schema(), + metadata, + config, + Path(meta_dict["parquet-dir"]) if "parquet-dir" in meta_dict else None, + ) + if config_updated is None: + logger.debug("Cancelled") + return + content = yaml.dump(config_updated) + config_file.write_text(content, encoding="utf-8") + logger.debug("Missingness generators in %s.", config_file) -@app.command() -def create_tables( +@app.command(rich_help_panel="3. Export Intermediate Files") +def make_stats( orm_file: Path = Option( ORM_FILENAME, help="The name of the ORM yaml file", @@ -255,57 +316,33 @@ def create_tables( help="The configuration file", dir_okay=False, ), + stats_file: Path = Option(STATS_FILENAME), + force: bool = Option( + False, "--force", "-f", help="Overwrite any existing vocabulary file." + ), ) -> None: - """Create schema from the ORM YAML file. + """Compute summary statistics from the source database.""" + logger.debug("Creating %s.", stats_file) - This CLI command creates the destination schema using object - relational model declared as Python tables. + if not force: + _check_file_non_existence(stats_file) - Example: - $ datafaker create-tables - """ - logger.debug("Creating tables.") config = read_config_file(config_file) if config_file is not None else {} - orm_metadata = load_metadata_for_output(orm_file, config) - create_db_tables(orm_metadata) - logger.debug("Tables created.") - + meta_dict = load_metadata_config(orm_file, config) -@app.command() -def create_generators( - _orm_file: Path = Option( - ORM_FILENAME, - help="The name of the ORM yaml file", - dir_okay=False, - ), - _df_file: Path = Option( - None, - help="Path to write Python generators to.", - dir_okay=False, - ), - _config_file: Path = Option( - CONFIG_FILENAME, - help="The configuration file", - dir_okay=False, - ), - _stats_file: Optional[Path] = Option( - None, - help=( - "Statistics file (output of make-stats); default is src-stats.yaml if the " - "config file references SRC_STATS, or None otherwise." - ), - show_default=False, - dir_okay=False, - ), - _force: bool = Option( - False, "--force", "-f", help="Overwrite any existing Python generators file." - ), -) -> None: - """Obsolete command.""" - logger.error("This command is deprecated; it does nothing.") + src_stats = asyncio.get_event_loop().run_until_complete( + make_src_stats( + get_source_dsn(), + config, + get_source_schema(), + parquet_dir=meta_dict.get("parquet-dir", None), + ) + ) + stats_file.write_text(yaml.dump(src_stats), encoding="utf-8") + logger.debug("%s created.", stats_file) -@app.command() +@app.command(rich_help_panel="3. Export Intermediate Files") def make_vocab( orm_file: Path = Option( ORM_FILENAME, @@ -344,8 +381,8 @@ def make_vocab( ) -@app.command() -def make_stats( +@app.command(rich_help_panel="4. Create the Synthetic Database") +def create_tables( orm_file: Path = Option( ORM_FILENAME, help="The name of the ORM yaml file", @@ -356,191 +393,120 @@ def make_stats( help="The configuration file", dir_okay=False, ), - stats_file: Path = Option(STATS_FILENAME), - force: bool = Option( - False, "--force", "-f", help="Overwrite any existing vocabulary file." - ), ) -> None: - """Compute summary statistics from the source database.""" - logger.debug("Creating %s.", stats_file) - - if not force: - _check_file_non_existence(stats_file) - - config = read_config_file(config_file) if config_file is not None else {} - meta_dict = load_metadata_config(orm_file, config) - - src_stats = asyncio.get_event_loop().run_until_complete( - make_src_stats( - get_source_dsn(), - config, - get_source_schema(), - parquet_dir=meta_dict.get("parquet-dir", None), - ) - ) - stats_file.write_text(yaml.dump(src_stats), encoding="utf-8") - logger.debug("%s created.", stats_file) - + """Create schema from the ORM YAML file. -@app.command() -def make_tables( - orm_file: Path = Option(ORM_FILENAME, help="Path to write the ORM yaml file to"), - force: bool = Option( - False, "--force", "-f", help="Overwrite any existing orm yaml file." - ), - parquet_dir: Optional[Path] = Option( - None, - help=( - "Directory of Parquet files to consider part of the database." - " This can be useful when using DuckDB." - " Make sure you check the output!" - ), - file_okay=False, - dir_okay=True, - ), -) -> None: - """Make a YAML file representing the tables in the schema. + This CLI command creates the destination schema using object + relational model declared as Python tables. Example: - $ datafaker make_tables + $ datafaker create-tables """ - logger.debug("Creating %s.", orm_file) - - orm_file_path = Path(orm_file) - if not force: - _check_file_non_existence(orm_file_path) - - content = make_tables_file( - get_source_dsn(), - get_source_schema(), - parquet_dir, - ) - orm_file_path.write_text(content, encoding="utf-8") - logger.debug("%s created.", orm_file) - - -@app.command() -def configure_tables( - config_file: Path = Option( - CONFIG_FILENAME, - help="Path to write the configuration file to", - dir_okay=False, - ), - orm_file: Path = Option( - ORM_FILENAME, - help="The name of the ORM yaml file", - dir_okay=False, - ), -) -> None: - """Interactively set tables to ignored, vocabulary or primary private.""" - logger.debug("Configuring tables in %s.", config_file) - config = {} - if config_file.exists(): - config = yaml.load( - config_file.read_text(encoding="UTF-8"), Loader=yaml.SafeLoader - ) - # we don't pass config here so that no tables are ignored - meta_dict = load_metadata_config(orm_file) - metadata = dict_to_metadata(meta_dict, None) - config_updated = update_config_tables( - get_source_dsn(), - get_source_schema(), - metadata, - config, - Path(meta_dict["parquet-dir"]) if "parquet-dir" in meta_dict else None, - ) - if config_updated is None: - logger.debug("Cancelled") - return - content = yaml.dump(config_updated) - config_file.write_text(content, encoding="utf-8") - logger.debug("Tables configured in %s.", config_file) - - -@app.command() -def configure_missing( - config_file: Path = Option( - CONFIG_FILENAME, - help="Path to write the configuration file to", - dir_okay=False, - ), + logger.debug("Creating tables.") + config = read_config_file(config_file) if config_file is not None else {} + orm_metadata = load_metadata_for_output(orm_file, config) + create_db_tables(orm_metadata) + logger.debug("Tables created.") + + +@app.command(rich_help_panel="4. Create the Synthetic Database") +def create_vocab( orm_file: Path = Option( ORM_FILENAME, help="The name of the ORM yaml file", dir_okay=False, ), -) -> None: - """Interactively set the missingness of the generated data.""" - logger.debug("Configuring missingness in %s.", config_file) - config: dict[str, Any] = {} - if config_file.exists(): - config_any = yaml.load( - config_file.read_text(encoding="UTF-8"), Loader=yaml.SafeLoader - ) - if isinstance(config_any, dict): - config = config_any - meta_dict = load_metadata_config(orm_file, config) - metadata = dict_to_metadata(meta_dict, None) - config_updated = update_missingness( - get_source_dsn(), - get_source_schema(), - metadata, - config, - Path(meta_dict["parquet-dir"]) if "parquet-dir" in meta_dict else None, - ) - if config_updated is None: - logger.debug("Cancelled") - return - content = yaml.dump(config_updated) - config_file.write_text(content, encoding="utf-8") - logger.debug("Missingness generators in %s.", config_file) - - -@app.command() -def configure_generators( config_file: Path = Option( CONFIG_FILENAME, - help="Path of the configuration file to alter", + help="The configuration file", dir_okay=False, ), +) -> None: + """Import vocabulary data into the target database. + + Example: + $ datafaker create-vocab + """ + logger.debug("Loading vocab.") + config = read_config_file(config_file) if config_file is not None else {} + meta_dict = load_metadata_config(orm_file, config) + orm_metadata = dict_to_metadata(meta_dict, config) + vocabs_loaded = create_db_vocab(orm_metadata, meta_dict, config) + num_vocabs = len(vocabs_loaded) + logger.debug("%s %s loaded.", num_vocabs, "table" if num_vocabs == 1 else "tables") + + +@app.command(rich_help_panel="4. Create the Synthetic Database") +def create_data( orm_file: Path = Option( ORM_FILENAME, help="The name of the ORM yaml file", dir_okay=False, ), - spec: Path = Option( + config_file: Optional[Path] = Option( + CONFIG_FILENAME, + help="The configuration file", + ), + stats_file: Optional[Path] = Option( None, help=( - "CSV file (headerless) with fields table-name," - " column-name, generator-name to set non-interactively" + "Statistics file (output of make-stats); default is src-stats.yaml if the " + "config file references SRC_STATS, or None otherwise." ), + show_default=False, + dir_okay=False, ), + num_passes: int = Option(1, help="Number of passes (rows or stories) to make"), ) -> None: - """Interactively set generators for column data.""" - logger.debug("Configuring generators in %s.", config_file) - config = {} - if config_file.exists(): - config = yaml.load( - config_file.read_text(encoding="UTF-8"), Loader=yaml.SafeLoader - ) - meta_dict = load_metadata_config(orm_file) - metadata = dict_to_metadata(meta_dict, None) - config_updated = update_config_generators( - DbCmd.Settings( - get_source_dsn(), - get_source_schema(), + """Populate the schema in the target directory with synthetic data. + + This CLI command generates synthetic data for + Python table structures, and inserts these rows + into a destination schema. + + Also takes as input object relational model as represented + by file containing Python classes and its attributes. + + Takes as input datafaker output as represented by Python + classes, its attributes and methods for generating values + for those attributes. + + Final input is the number of rows required. + + Example: + $ datafaker create-data + """ + logger.debug("Creating data.") + config = read_config_file(config_file) if config_file is not None else {} + if stats_file is None and generators_require_stats(config): + stats_file = Path(STATS_FILENAME) + orm_metadata = load_metadata_for_output(orm_file, config) + try: + row_counts = create_db_data( + sorted_non_vocabulary_tables(orm_metadata, config), config, - metadata, - meta_dict.get("parquet-dir", None), - ), - spec_path=spec, - ) - if config_updated is None: - logger.debug("Cancelled") + stats_file, + num_passes, + orm_metadata, + ) + logger.debug( + "Data created in %s %s.", + num_passes, + "pass" if num_passes == 1 else "passes", + ) + for table_name, row_count in row_counts.items(): + logger.debug( + "%s: %s %s created.", + table_name, + row_count, + "row" if row_count == 1 else "rows", + ) return - content = yaml.dump(config_updated) - config_file.write_text(content, encoding="utf-8") - logger.debug("Generators configured in %s.", config_file) + except RuntimeError as e: + logger.error(e.args[0]) + except SettingsError as e: + logger.error(str(e)) + raise Exit(1) def convert_table_names_to_tables( @@ -612,73 +578,11 @@ def _dump_tables_to_directory( logger.warning("Failed to write %s", f) -@app.command() -def dump_data( - config_file: Optional[Path] = Option( - CONFIG_FILENAME, - help="Path of the configuration file to use", - dir_okay=False, - ), - orm_file: Path = Option( - ORM_FILENAME, - help="The name of the ORM yaml file", - dir_okay=False, - ), - table: list[str] = Option( - default=[], - help="The tables to dump (default is all non-ignored, non-vocabulary tables)", - ), - output: Path - | None = Option( - None, - help=( - "Output CSV or Parquet file name," - " directory to write into or - to output to the console" - ), - file_okay=True, - dir_okay=True, - ), - parquet: bool = Option( - False, - help="Use Parquet format (default use CSV unless --output specifies a .parquet file)", - ), -) -> None: - """Dump a whole table as a CSV file (or to the console) from the destination database.""" - directory = Path(".") - if output: - if Path(output).is_dir(): - directory = Path(output) - output = None - elif len(table) != 1: - logger.error( - "Must specify exactly one table if the output name is" - " specified, or specify an existing directory" - ) - raise Exit(1) - dst_dsn = get_destination_dsn() - schema_name = get_destination_schema() - config = read_config_file(config_file) if config_file is not None else {} - metadata = load_metadata_for_output(orm_file, config) - mtables = convert_table_names_to_tables(table, metadata) - if not mtables: - mtables = generated_tables(metadata, config) - if output == "-": - _dump_csv_to_stdout(mtables[0], metadata, dst_dsn, schema_name) - return - writer = _get_writer(parquet, output, metadata, dst_dsn, schema_name) - if output: - mtable = mtables[0] - if not writer.write_file(mtable, directory / output): - logger.error("Could not write table %s to file %s", mtable.name, output) - return - _dump_tables_to_directory(writer, directory, mtables) - - -@app.command() +@app.command(rich_help_panel="2. Configure Export") def validate_config( config_file: Path = Argument(help="The configuration file to validate"), ) -> None: - """Validate the format of a config file.""" + """Validate the format of a config file (useful if it has been edited by hand).""" logger.debug("Validating config file: %s.", config_file) config = yaml.load(config_file.read_text(encoding="UTF-8"), Loader=yaml.SafeLoader) @@ -691,7 +595,7 @@ def validate_config( logger.debug("Config file is valid.") -@app.command() +@app.command(rich_help_panel="5. Inspect Synthetic Data") def remove_data( orm_file: Path = Option( ORM_FILENAME, @@ -718,7 +622,7 @@ def remove_data( logger.info("Would truncate non-vocabulary tables if called with --yes.") -@app.command() +@app.command(rich_help_panel="5. Inspect Synthetic Data") def remove_vocab( orm_file: Path = Option( ORM_FILENAME, @@ -746,7 +650,7 @@ def remove_vocab( logger.info("Would truncate vocabulary tables if called with --yes.") -@app.command() +@app.command(rich_help_panel="5. Inspect Synthetic Data") def remove_tables( orm_file: Path = Option( ORM_FILENAME, @@ -797,7 +701,7 @@ class TableType(str, Enum): GENERATED = "generated" -@app.command() +@app.command(rich_help_panel="5. Inspect Synthetic Data") def list_tables( orm_file: Path = Option( ORM_FILENAME, @@ -830,7 +734,69 @@ def list_tables( print(name) -@app.command() +@app.command(rich_help_panel="5. Inspect Synthetic Data") +def dump_data( + config_file: Optional[Path] = Option( + CONFIG_FILENAME, + help="Path of the configuration file to use", + dir_okay=False, + ), + orm_file: Path = Option( + ORM_FILENAME, + help="The name of the ORM yaml file", + dir_okay=False, + ), + table: list[str] = Option( + default=[], + help="The tables to dump (default is all non-ignored, non-vocabulary tables)", + ), + output: Path + | None = Option( + None, + help=( + "Output CSV or Parquet file name," + " directory to write into or - to output to the console" + ), + file_okay=True, + dir_okay=True, + ), + parquet: bool = Option( + False, + help="Use Parquet format (default use CSV unless --output specifies a .parquet file)", + ), +) -> None: + """Dump a whole table as a CSV file (or to the console) from the destination database.""" + directory = Path(".") + if output: + if Path(output).is_dir(): + directory = Path(output) + output = None + elif len(table) != 1: + logger.error( + "Must specify exactly one table if the output name is" + " specified, or specify an existing directory" + ) + raise Exit(1) + dst_dsn = get_destination_dsn() + schema_name = get_destination_schema() + config = read_config_file(config_file) if config_file is not None else {} + metadata = load_metadata_for_output(orm_file, config) + mtables = convert_table_names_to_tables(table, metadata) + if not mtables: + mtables = generated_tables(metadata, config) + if output == "-": + _dump_csv_to_stdout(mtables[0], metadata, dst_dsn, schema_name) + return + writer = _get_writer(parquet, output, metadata, dst_dsn, schema_name) + if output: + mtable = mtables[0] + if not writer.write_file(mtable, directory / output): + logger.error("Could not write table %s to file %s", mtable.name, output) + return + _dump_tables_to_directory(writer, directory, mtables) + + +@app.command(rich_help_panel="1. Initialize and Check") def version() -> None: """Display version information.""" assert __package__ is not None