From 3d1769b4378d6bcd05ae29ee63a32c2cba89f9e5 Mon Sep 17 00:00:00 2001
From: maskedsyntax
@@ -206,7 +210,7 @@ generate_report(
--target COLUMN — target column for ML checks (class imbalance, leakage, etc.)
--checks LIST — comma-separated list of checks to run
--comparison FILE — compare with another dataset for drift detection
--critical-only — hide warnings and show only critical issues
--json — emit JSON instead of human-readable text
--quiet — minimal output, useful in CI
--sample-size N / --no-sample — control automatic sampling
--config FILE — load thresholds from a YAML, TOML, or JSON file
hashprep details dataset.csv --target Survived
- Accepts the same options as scan and is best used when you are actively debugging a dataset
+ Accepts the same options as scan (including --comparison and --config) and is best used when you are actively debugging a dataset
or deciding which columns to drop or transform.
--with-code — write companion _fixes.py and _pipeline.py files
--comparison FILE — compare two datasets for drift (train vs test, etc.)
--sample-size N / --no-sample — control automatic sampling
--config FILE — load thresholds from a YAML, TOML, or JSON file
missing_values — overall missingness patterns
dataset_missingness — overall missing data patterns
high_missing_values — columns with heavy missingness
missing_patterns — correlated missing value patterns
duplicates — duplicate rows
empty_columns — completely empty columns
empty_dataset — empty or all-missing datasets
single_value_columns — near-constant features
mixed_data_types — columns with mixed data types
outliers — IQR-based outlier detection
high_cardinality — categorical columns with too many uniques
outliers — z-score outlier detection
skewness — highly skewed numeric distributions
high_cardinality — too many unique categorical values
uniform_distribution — uniformly distributed numeric columns
many_zeros — features dominated by zeros
unique_values — columns where >95% values are unique
high_zero_counts — features dominated by zeros
infinite_values — columns containing infinite values
constant_length — string columns with constant character length
extreme_text_lengths — text columns with extreme value lengths
normality — non-normal distributions (Shapiro-Wilk / D’Agostino-Pearson)
variance_homogeneity — unequal variances across target groups (Levene’s test)
class_imbalance — target imbalance (requires --target)
feature_correlation — highly correlated features
target_leakage — features leaking target information
dataset_drift — drift between train / test datasets
feature_correlation — highly correlated numeric features
categorical_correlation — highly associated categorical features
mixed_correlation — numeric-categorical associations
data_leakage — columns identical to target
target_leakage_patterns — features leaking target information
low_mutual_information — near-zero MI with target (requires --target)
dataset_drift — drift between train / test datasets (requires --comparison)
datetime_skew — datetime columns concentrated in one period
datetime_future_dates — datetime values in the future
datetime_gaps — anomalous gaps in datetime sequences
datetime_monotonicity — non-monotonic datetime columns
+ Every detection threshold in HashPrep has a sensible default. You can override any of them at runtime by
+ providing a config file — no code changes required.
+
+
+ Config files can be written in YAML (.yaml / .yml), TOML (.toml),
+ or JSON (.json). Only the keys you specify are changed; everything else falls back to
+ defaults.
# hashprep.yaml
+missing_values:
+ warning: 0.3
+ critical: 0.6
+outliers:
+ z_score: 3.5
+statistical_tests:
+ normality_p_value: 0.01
+mutual_info:
+ low_mi_warning: 0.05
+hashprep scan dataset.csv \
+  --config hashprep.yaml
+
+hashprep report dataset.csv \
+ --format html \
+ --config hashprep.toml
+from hashprep.utils.config_loader import load_config
+from hashprep import DatasetAnalyzer
+
+config = load_config("hashprep.yaml")
+analyzer = DatasetAnalyzer(df, config=config)
+summary = analyzer.analyze()
+