diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..d31cf38 --- /dev/null +++ b/.env.example @@ -0,0 +1,10 @@ +# Environment Variables for ACCESS Project +# Copy this file to .env and fill in your actual values + +# Census API Key (Required for demographic analysis) +# Get your free API key at: https://api.census.gov/data/key_signup.html +# This key is used to download demographic data from the US Census Bureau +CENSUS_API_KEY=your_api_key_here + +# Note: The pipeline can run without this key, but the analysis step will be skipped +# If you have cached Census data, the API key is optional diff --git a/BACKLOG.md b/BACKLOG.md index c6dc55e..490d89f 100644 --- a/BACKLOG.md +++ b/BACKLOG.md @@ -9,6 +9,9 @@ - ✅ IMP-009: Enhanced Print Layouts (2025-11-15) - ✅ IMP-006: Webmap Enhancements (2025-11-09) - ✅ FR-003: Mobile-Friendly Webmap (2025-11-09) +- 🔄 TD-007: Error Handling Strategy - Partial (2025-11-15) +- 🔄 IMP-004: Improved Logging and Monitoring - Partial (2025-11-15) +- 🔄 IMP-003: Documentation Improvements - Partial (2025-11-15) This document consolidates technical debt, feature requests, and improvements identified through comprehensive project analysis. Items are categorized by type, priority, and estimated effort. @@ -98,7 +101,7 @@ Project uses OSMnx 1.3.0 (pinned), but latest stable version is 2.0+ (as of 2025 ### TD-003: Mixed Import Patterns for H3 Module **Priority:** Medium **Effort:** Small (4-8 hours) -**Status:** ✅ **COMPLETED** (2025-01-XX) +**Status:** ✅ **COMPLETED** (2025-11-15) **Category:** Code Quality **Description:** @@ -112,6 +115,8 @@ The `src/h3/` module used an inconsistent import pattern due to naming conflict 5. ✅ Removed mypy exclude for h3 module (no longer needed) 6. ✅ Updated pre-commit configuration +**Note:** Some legacy notebooks still use `from h3utils import *` (referring to `src/h3utils.py`, a separate utility file). 
The `src/h3_utils/` package directory is properly renamed and used throughout the main codebase. + **Files Modified:** - `src/h3_utils/` (renamed from `src/h3/`) - `src/run_pipeline.py` - Updated import @@ -261,7 +266,8 @@ Modern alternatives exist: ### TD-007: No Error Handling Strategy **Priority:** High -**Effort:** Medium (20-30 hours) +**Effort:** Medium (20-30 hours) → **12-18 hours remaining** +**Status:** 🔄 **IN PROGRESS** (2025-11-15) **Category:** Error Handling / Logging **Description:** @@ -272,6 +278,15 @@ Inconsistent error handling and logging across the codebase: - No error recovery mechanisms - Failed operations may leave partial data +**Progress (2025-11-15):** +- ✅ Fixed empty except blocks in `changelog.py` (2 locations) +- ✅ Fixed empty except blocks in `probe_data_sources.py` (2 locations) +- ✅ Added proper error logging with context messages +- ✅ Consistent logging patterns established (see DEVELOPMENT.md) +- ❌ Custom exception hierarchy not yet created +- ❌ Retry logic for network operations not yet implemented +- ❌ Pipeline validation checkpoints not yet added + **Examples of Issues:** - What happens if OSMnx graph download fails mid-process? - How are missing geometries handled in walk time calculations? @@ -284,15 +299,13 @@ Inconsistent error handling and logging across the codebase: - Data corruption risks - Poor user experience -**Solution:** -1. Define error handling strategy and patterns -2. Create custom exception hierarchy -3. Add comprehensive logging with levels (DEBUG, INFO, WARNING, ERROR) -4. Add validation checkpoints in pipeline -5. Implement retry logic for network operations -6. Add data validation before/after processing steps -7. Create error recovery guide for common failures -8. Add structured logging (JSON) for monitoring +**Remaining Work:** +1. ❌ Create custom exception hierarchy +2. ❌ Add validation checkpoints in pipeline +3. ❌ Implement retry logic for network operations +4. 
❌ Add data validation before/after processing steps +5. ❌ Create error recovery guide for common failures +6. ❌ Add structured logging (JSON) for monitoring **Specific Improvements:** - Add transaction-like behavior for data updates @@ -783,7 +796,8 @@ Strengthen data validation throughout the pipeline. ### IMP-003: Documentation Improvements **Priority:** Medium -**Effort:** Medium (20-30 hours) +**Effort:** Medium (20-30 hours) → **16-25 hours remaining** +**Status:** 🔄 **IN PROGRESS** (2025-11-15) **Category:** Documentation **Description:** @@ -797,37 +811,48 @@ Enhance documentation for users, developers, and researchers. - Test README - Notebooks demonstrate workflows +**Progress (2025-11-15):** +- ✅ Created DEVELOPMENT.md with developer guidelines +- ✅ Documented logging best practices with code examples +- ✅ Documented library vs entry point patterns +- ✅ Documented TQDM integration +- ✅ Created .env.example documenting required environment variables (added in this change) +- ❌ No API documentation yet +- ❌ No auto-generated docs yet +- ❌ Contributing guidelines not yet created + **Improvements Needed:** 1. **API Documentation:** - - Auto-generated API docs (Sphinx/MkDocs) - - Module documentation - - Function signatures and examples - - Type hints throughout + - ❌ Auto-generated API docs (Sphinx/MkDocs) + - ❌ Module documentation + - ❌ Function signatures and examples + - ❌ Type hints throughout 2. **User Guides:** - - Step-by-step tutorials - - Common workflows - - Troubleshooting guide (expand existing) - - FAQ section + - ❌ Step-by-step tutorials + - ❌ Common workflows + - ❌ Troubleshooting guide (expand existing) + - ❌ FAQ section 3. **Developer Guides:** - - Contributing guidelines - - Code style guide - - Testing guide - - Release process + - ✅ Development best practices (DEVELOPMENT.md) + - ❌ Contributing guidelines (CONTRIBUTING.md) + - ❌ Code style guide + - ❌ Testing guide + - ❌ Release process 4. 
**Research Documentation:** - - Methodology documentation - - Algorithm descriptions - - Validation approach - - Reproducibility guide + - ❌ Methodology documentation + - ❌ Algorithm descriptions + - ❌ Validation approach + - ❌ Reproducibility guide 5. **Architecture Documentation:** - - System design - - Data flow diagrams (expand existing Mermaid) - - Module dependencies - - Extension points + - ❌ System design + - ❌ Data flow diagrams (expand existing Mermaid) + - ❌ Module dependencies + - ❌ Extension points **Tools:** - **Sphinx**: Python standard, autodoc @@ -835,18 +860,20 @@ Enhance documentation for users, developers, and researchers. - **Jupyter Book**: Integrate notebooks - **Mermaid**: Diagrams (already used) -**Implementation:** +**Remaining Work:** 1. Choose documentation tool 2. Set up documentation structure 3. Add docstrings throughout code -4. Write guides and tutorials -5. Deploy documentation site +4. Write CONTRIBUTING.md +5. Write guides and tutorials +6. Deploy documentation site --- ### IMP-004: Improved Logging and Monitoring **Priority:** Medium -**Effort:** Medium (16-24 hours) +**Effort:** Medium (16-24 hours) → **8-12 hours remaining** +**Status:** 🔄 **IN PROGRESS** (2025-11-15) **Category:** Observability **Description:** @@ -858,15 +885,29 @@ Enhance logging for better debugging and monitoring. 
- No structured logging - No centralized log aggregation +**Progress (2025-11-15):** +- ✅ Replaced print() statements with proper logging in library modules +- ⚠️ CLI scripts (`probe_data_sources.py`, `changelog.py`) still use print() for user-facing output (acceptable for CLI) +- ✅ Established consistent logging patterns: + - Entry scripts use `logging.basicConfig()` with handlers + - Library modules use `logger = logging.getLogger(__name__)` +- ✅ Created DEVELOPMENT.md with logging guidelines and examples +- ✅ Documented integration with TQDM progress bars +- ✅ Proper log levels used (DEBUG, INFO, WARNING, ERROR) +- ❌ No structured logging (JSON) yet +- ❌ No centralized log aggregation yet +- ❌ No monitoring dashboards yet + **Improvements:** 1. **Structured Logging:** - - JSON format for machine parsing - - Consistent log levels - - Context information (user, region, operation) - - Request IDs for tracing + - ❌ JSON format for machine parsing + - ✅ Consistent log levels + - ❌ Context information (user, region, operation) + - ❌ Request IDs for tracing 2. **Log Levels:** + - ✅ Properly applied throughout codebase ```python DEBUG: Detailed diagnostic info INFO: General informational messages @@ -876,29 +917,28 @@ Enhance logging for better debugging and monitoring. ``` 3. **Performance Logging:** - - Operation timing - - Resource usage - - Progress tracking - - Bottleneck identification + - ❌ Operation timing + - ❌ Resource usage + - ❌ Progress tracking + - ❌ Bottleneck identification 4. **Log Management:** - - Log rotation - - Compression - - Retention policy - - Search and analysis + - ❌ Log rotation + - ❌ Compression + - ❌ Retention policy + - ❌ Search and analysis 5. **Monitoring:** - - Metrics collection (Prometheus) - - Dashboards (Grafana) - - Alerting - - Health checks + - ❌ Metrics collection (Prometheus) + - ❌ Dashboards (Grafana) + - ❌ Alerting + - ❌ Health checks -**Implementation:** -1. Add `structlog` library -2. Create logging configuration -3. 
Update all modules to use structured logging -4. Set up log aggregation -5. Create monitoring dashboards +**Remaining Work:** +1. Add `structlog` library for structured logging +2. Add performance/timing logging +3. Set up log rotation and management +4. Create monitoring dashboards (optional) --- @@ -1452,13 +1492,38 @@ For questions or to contribute: --- -**Document Version:** 1.3 -**Last Updated:** 2025-11-09 -**Previous Version:** 1.2 (2025-11-09) +**Document Version:** 1.4.1 +**Last Updated:** 2025-11-15 +**Previous Version:** 1.4 (2025-11-15) **Analysis Method:** Comprehensive codebase review, dependency analysis, and best practices research **Revision Notes:** +**v1.4.1 (2025-11-15):** +- Accuracy verification: Reviewed all status indicators against actual codebase +- Corrected IMP-003: .env.example not yet created (was incorrectly marked as completed) +- Clarified IMP-004: Print statements in CLI scripts are acceptable for user-facing output +- Updated TD-003: Fixed completion date placeholder and added note about legacy notebooks +- Verified TD-009, IMP-005, IMP-006, FR-003, IMP-009 completion status (all accurate) + +**v1.4 (2025-11-15):** +- Updated TD-007 (Error Handling Strategy) - marked as IN PROGRESS + - Fixed 4 empty except blocks with proper error logging + - Documented progress and remaining work +- Updated IMP-004 (Improved Logging and Monitoring) - marked as IN PROGRESS + - Replaced print() statements with proper logging in library modules + - CLI scripts still use print() for user-facing output (acceptable) + - Established consistent logging patterns + - Created DEVELOPMENT.md with logging guidelines +- Updated IMP-003 (Documentation Improvements) - marked as IN PROGRESS + - Created DEVELOPMENT.md with developer best practices + - Corrected: .env.example not yet created (was incorrectly marked as completed) +- Updated TD-003 (H3 Module Import Pattern) - corrected completion date from placeholder + - Added note about legacy notebooks using 
separate h3utils.py file +- Updated effort estimates for in-progress items +- Added recent completions section +- Verified accuracy of all status indicators against codebase + **v1.3 (2025-11-09):** - Added TD-011: H3 Not Used as Primary Geographic Unit (technical debt) - Added FR-004: Complete H3 Implementation as Primary Geographic Unit (feature request) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md new file mode 100644 index 0000000..8f2cc29 --- /dev/null +++ b/DEVELOPMENT.md @@ -0,0 +1,119 @@ +# Development Guide + +This document provides guidelines for contributors working on the ACCESS codebase. + +## Logging Best Practices + +This codebase uses Python's standard `logging` module. Follow these patterns for consistency: + +### For Library Modules (files imported by other code) + +Use `logging.getLogger(__name__)` WITHOUT calling `basicConfig()`: + +```python +import logging + +logger = logging.getLogger(__name__) + +def my_function(): + logger.info("Processing data...") + logger.warning("Data quality issue detected") + logger.error("Failed to process file") +``` + +**Examples:** `src/walk_times/calculate.py`, `src/h3utils.py`, `src/merging/analysis.py` + +### For Entry Point Scripts (standalone scripts with `if __name__ == "__main__"`) + +Use `logging.basicConfig()` with handlers, then get a logger: + +```python +import logging + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('data/my_script_log.txt'), # Optional: log to file + logging.StreamHandler() # Log to console + ] +) +logger = logging.getLogger(__name__) + +def main(): + logger.info("Starting processing...") + # ... your code ... 
+``` + +**Examples:** `src/run_pipeline.py`, `src/update_data_sources.py`, `src/convert_to_pmtiles.py` + +### Log Levels + +Choose appropriate log levels: + +- `logger.debug()` - Detailed diagnostic information (not shown by default) +- `logger.info()` - General informational messages about progress +- `logger.warning()` - Warning messages (something unexpected but not fatal) +- `logger.error()` - Error messages (operation failed but script continues) +- `logger.critical()` - Critical errors (script must exit) + +### Integration with TQDM Progress Bars + +When using TQDM for progress indication, logging works seamlessly: + +```python +from tqdm import tqdm +import logging + +logger = logging.getLogger(__name__) + +def process_items(items): + logger.info(f"Processing {len(items)} items") + for item in tqdm(items, desc="Processing"): + # TQDM will show progress bar + # logger messages will appear above the progress bar + if item.needs_attention(): + logger.warning(f"Issue with item {item.id}") +``` + +**Examples:** `src/walk_times/calculate.py`, `src/walk_times/algorithms.py` + +### Error Handling + +Always log exceptions properly: + +```python +# Good - logs error with details +try: + process_data(file) +except ValueError as e: + logger.error(f"Invalid data in {file}: {e}") + +# Bad - silent failure +try: + process_data(file) +except ValueError: + pass +``` + +## Environment Variables + +See `.env.example` for required environment variables. 
Copy it to `.env` and fill in your values: + +```bash +cp .env.example .env +# Edit .env with your actual values +``` + +## Testing + +- Write tests for new functionality in `tests/` +- Run tests with: `pytest tests/` +- See existing test files for examples + +## Code Style + +- Follow PEP 8 guidelines +- Use type hints where practical +- Add docstrings to public functions and classes diff --git a/src/changelog.py b/src/changelog.py index b9c579a..9da1ccc 100755 --- a/src/changelog.py +++ b/src/changelog.py @@ -161,8 +161,9 @@ def create_notification( try: with open(notifications_file) as f: notifications = json.load(f) - except (OSError, json.JSONDecodeError): - pass + except (OSError, json.JSONDecodeError) as e: + logging.warning(f"Failed to load notifications file: {e}. Starting with empty list.") + notifications = [] notifications.append(notification) @@ -217,8 +218,8 @@ def mark_notification_read(notification_timestamp: str): with open(notifications_file, "w") as f: json.dump(notifications, f, indent=2, default=str) - except (OSError, json.JSONDecodeError): - pass + except (OSError, json.JSONDecodeError) as e: + logging.error(f"Failed to update notification read status: {e}") def main(): diff --git a/src/convert_to_pmtiles.py b/src/convert_to_pmtiles.py index ae3365f..eeef51b 100755 --- a/src/convert_to_pmtiles.py +++ b/src/convert_to_pmtiles.py @@ -7,12 +7,21 @@ """ import argparse +import logging import shutil import subprocess # nosec B404 import sys import tempfile from pathlib import Path +# Set up logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], +) +logger = logging.getLogger(__name__) + def check_command(command: str) -> bool: """Check if a command is available.""" @@ -50,13 +59,13 @@ def convert_to_geojson( # Save as GeoJSON gdf.to_file(output_path, driver="GeoJSON") - print(f"Converted {input_path} to {output_path}") + logger.info(f"Converted {input_path} to 
{output_path}") return True except ImportError: - print("Error: geopandas not found. Please install geopandas.") + logger.error("geopandas not found. Please install geopandas.") return False except Exception as e: - print(f"Error converting {input_path}: {e}") + logger.error(f"Error converting {input_path}: {e}") return False @@ -83,7 +92,7 @@ def convert_to_pmtiles( True if successful, False otherwise """ if not check_command("tippecanoe"): - print("Error: tippecanoe not found. Please install tippecanoe (v2.17+).") + logger.error("tippecanoe not found. Please install tippecanoe (v2.17+).") return False # Check tippecanoe version supports PMTiles @@ -92,9 +101,9 @@ def convert_to_pmtiles( ["tippecanoe", "--version"], capture_output=True, text=True, check=True ) version_str = version_result.stdout.strip() - print(f"Using tippecanoe: {version_str}") + logger.info(f"Using tippecanoe: {version_str}") except subprocess.CalledProcessError: - print("Warning: Could not check tippecanoe version") + logger.warning("Could not check tippecanoe version") cmd = [ "tippecanoe", @@ -118,10 +127,10 @@ def convert_to_pmtiles( try: subprocess.run(cmd, check=True, capture_output=True, text=True) # nosec B603 - print(f"Converted {geojson_path} to {output_path}") + logger.info(f"Converted {geojson_path} to {output_path}") return True except subprocess.CalledProcessError as e: - print(f"Error converting to PMTiles: {e.stderr}") + logger.error(f"Error converting to PMTiles: {e.stderr}") return False @@ -186,16 +195,16 @@ def main(): args = parser.parse_args() if not args.input.exists(): - print(f"Error: Input file {args.input} does not exist") + logger.error(f"Input file {args.input} does not exist") sys.exit(1) success = convert_file(args.input, args.output, args.layer, args.min_zoom, args.max_zoom) if success: - print(f"Successfully created {args.output}") + logger.info(f"Successfully created {args.output}") sys.exit(0) else: - print(f"Failed to create {args.output}") + 
logger.error(f"Failed to create {args.output}") sys.exit(1) diff --git a/src/download_graphs.py b/src/download_graphs.py index f165e3a..8ee5fb0 100644 --- a/src/download_graphs.py +++ b/src/download_graphs.py @@ -1,9 +1,19 @@ +import logging + import osmnx as ox +# Set up logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], +) +logger = logging.getLogger(__name__) + ox.settings.cache_folder = "./cache/" ox.settings.log_console = True -print("Using OSMnx version", ox.__version__) -print("WARNING: This script requires >10GB RAM available") +logger.info(f"Using OSMnx version {ox.__version__}") +logger.warning("This script requires >10GB RAM available") # download/model a network of driving routes for the state of Maine G = ox.graph_from_place({"state": "Maine"}, network_type="drive") diff --git a/src/h3utils.py b/src/h3utils.py index b395b47..c87b9b6 100644 --- a/src/h3utils.py +++ b/src/h3utils.py @@ -1,10 +1,13 @@ import json +import logging from pathlib import Path import matplotlib import pandas as pd from jsonschema import validate +logger = logging.getLogger(__name__) + try: from .config.defaults import DEFAULT_H3_RESOLUTION from .config.regions import RegionConfig @@ -48,13 +51,13 @@ def h3_merge(df, reln=None, inplace=False, resolution=None, region_config=None): # Summarize a given column by h3 fraction def h3_weight(df, col, prefix="h3_"): - print(f"Creating {prefix + col}") + logger.info(f"Creating {prefix + col}") df[prefix + col] = df[col] * df["h3_fraction"] # Summarize a given column by h3 fraction, further weighting by population fraction def h3_weight_pop(df, col, prefix="h3_"): - print(f"Creating {prefix + col}") + logger.info(f"Creating {prefix + col}") df[prefix + col] = df[col] * df["P1_001N"] * df["h3_fraction"] @@ -64,7 +67,7 @@ def h3_plot(df, col: str, lognorm=True, inplace=False, **plot_kwargs): if "h3id" not in df.index.names: df = h3_merge(df) if not 
col.startswith("h3_"): - print(f"Interpreting '{col}' as 'h3_{col}'") + logger.info(f"Interpreting '{col}' as 'h3_{col}'") col = "h3_" + col if col not in df.columns: h3_weight(df, col[3:]) diff --git a/src/probe_data_sources.py b/src/probe_data_sources.py index 995a5a1..8ef5910 100755 --- a/src/probe_data_sources.py +++ b/src/probe_data_sources.py @@ -72,10 +72,12 @@ def get_remote_file_date(url: str) -> datetime | None: except (ValueError, TypeError) as e: # Invalid date format - log and continue import logging + logging.debug(f"Could not parse Last-Modified header '{last_modified}': {e}") except Exception as e: # Network or other error - log and continue import logging + logging.debug(f"Error getting remote file date from {url}: {e}") return None diff --git a/src/validate_data.py b/src/validate_data.py index dcb1418..1bcf114 100755 --- a/src/validate_data.py +++ b/src/validate_data.py @@ -123,7 +123,9 @@ def compare_schemas(old_schema: dict, new_schema: dict) -> dict[str, Any]: old_type = old_dtypes.get(col) new_type = new_dtypes.get(col) if old_type and new_type and old_type != new_type: - changes["type_changes"].append({"column": col, "old_type": old_type, "new_type": new_type}) + changes["type_changes"].append( + {"column": col, "old_type": old_type, "new_type": new_type} + ) # Check row count change old_count = old_schema.get("row_count", 0) @@ -154,7 +156,8 @@ def compare_schemas(old_schema: dict, new_schema: dict) -> dict[str, Any]: def validate_data_quality( - file_path: Path, schema: dict | None = None # noqa: ARG001 + file_path: Path, + schema: dict | None = None, # noqa: ARG001 ) -> dict[str, Any]: """Validate data quality metrics.""" quality_metrics: dict[str, Any] = { @@ -336,7 +339,9 @@ def validate_data_file(file_path: Path, source_name: str | None = None) -> dict[ if schema_changes["crs_change"]: crs_change = schema_changes["crs_change"] - validation_result["warnings"].append(f"CRS changed: {crs_change['old']} -> {crs_change['new']}") + 
validation_result["warnings"].append( + f"CRS changed: {crs_change['old']} -> {crs_change['new']}" + ) # Save current schema version if source_name not in schema_versions: @@ -355,10 +360,14 @@ def validate_data_file(file_path: Path, source_name: str | None = None) -> dict[ # Check for quality issues if quality_metrics["invalid_geometries"] > 0: - validation_result["warnings"].append(f"{quality_metrics['invalid_geometries']} invalid geometries") + validation_result["warnings"].append( + f"{quality_metrics['invalid_geometries']} invalid geometries" + ) if quality_metrics["empty_geometries"] > 0: - validation_result["warnings"].append(f"{quality_metrics['empty_geometries']} empty geometries") + validation_result["warnings"].append( + f"{quality_metrics['empty_geometries']} empty geometries" + ) high_missing = { col: metrics @@ -366,7 +375,9 @@ def validate_data_file(file_path: Path, source_name: str | None = None) -> dict[ if metrics["percent"] > 10 } if high_missing: - warnings.append(f"High missing values (>10%): {', '.join(high_missing.keys())}") + validation_result["warnings"].append( + f"High missing values (>10%): {', '.join(high_missing.keys())}" + ) # Check coordinate system crs_valid, crs_message = check_coordinate_system_consistency(file_path)