diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..e623d7ac7 --- /dev/null +++ b/.clang-format @@ -0,0 +1,83 @@ +--- +# BasedOnStyle: LLVM +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: false +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +BreakStringLiterals: true +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +FixNamespaceComments: true +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '$' +IndentCaseLabels: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 4 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 10000000 +PointerAlignment: Left +RawStringFormats: + - Delimiters: [pb] + Language: TextProto + BasedOnStyle: google +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 4 +UseTab: Never +... diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..39271bca7 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,229 @@ +--- +Checks: > + boost-use-to-string, + bugprone-*, + -bugprone-easily-swappable-parameters, + -bugprone-implicit-widening-of-multiplication-result, + -bugprone-narrowing-conversions, + cert-err34-c, + cert-flp30-c, + cert-msc32-c, + cert-msc50-cpp, + cert-msc51-cpp, + clang-diagnostic-*, + cppcoreguidelines-interfaces-global-init, + cppcoreguidelines-no-malloc, + cppcoreguidelines-pro-type-static-cast-downcast, + cppcoreguidelines-pro-type-union-access, + cppcoreguidelines-slicing, + google-build-namespaces, + google-explicit-constructor, + google-global-names-in-headers, + google-readability-casting, + google-runtime-member-string-references, + google-runtime-memset, + hicpp-exception-baseclass, + misc-*, + -misc-confusable-identifiers, + -misc-const-correctness, + -misc-no-recursion, + modernize-*, + -modernize-avoid-c-arrays, + -modernize-use-trailing-return-type, + performance-*, + readability-*, + -readability-avoid-const-params-in-decls, + -readability-function-cognitive-complexity, + -readability-container-contains, + -readability-identifier-length, + -readability-magic-numbers, + +# WarningsAsErrors and HeaderFilterRegex work together: +# HeaderFilterRegex scopes which headers clang-tidy reports diagnostics for. +# Warnings from headers outside the regex (PyTorch, pybind11, etc.) are suppressed +# entirely and never reach WarningsAsErrors — so the large warning counts printed +# by clang-tidy ("N warnings generated") are third-party noise that is silently +# dropped. Only diagnostics in our own headers (.*/gigl/csrc/.*) are reported, +# and those are treated as hard errors. +WarningsAsErrors: '*' +HeaderFilterRegex: '.*/gigl/csrc/.*' +FormatStyle: none +User: jenkins +# CheckOptions: per-check tuning parameters. Each entry configures a specific +# option for an individual check, using the form: +# key: . +# value: +# These let you adjust thresholds, naming patterns, and behavior without +# enabling or disabling the check entirely. +CheckOptions: + - key: bugprone-argument-comment.StrictMode + value: '0' + - key: bugprone-assert-side-effect.AssertMacros + value: 'assert,SC_ASSERT' + - key: bugprone-assert-side-effect.CheckFunctionCalls + value: '0' + - key: bugprone-dangling-handle.HandleClasses + value: 'std::basic_string_view;std::experimental::basic_string_view' + - key: bugprone-string-constructor.LargeLengthThreshold + value: '8388608' + - key: bugprone-string-constructor.WarnOnLargeLength + value: '1' + - key: cppcoreguidelines-pro-type-member-init.IgnoreArrays + value: '1' + - key: google-global-names-in-headers.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' + - key: misc-definitions-in-headers.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: misc-definitions-in-headers.UseHeaderFileExtension + value: '1' + - key: misc-misplaced-widening-cast.CheckImplicitCasts + value: '0' + - key: misc-sizeof-expression.WarnOnSizeOfCompareToConstant + value: '1' + - key: misc-sizeof-expression.WarnOnSizeOfConstant + value: '1' + - key: misc-sizeof-expression.WarnOnSizeOfThis + value: '1' + - key: misc-suspicious-enum-usage.StrictMode + value: '0' + - key: misc-suspicious-missing-comma.MaxConcatenatedTokens + value: '5' + - key: misc-suspicious-missing-comma.RatioThreshold + value: '0.200000' + - key: misc-suspicious-missing-comma.SizeThreshold + value: '5' + - key: misc-suspicious-string-compare.StringCompareLikeFunctions + value: '' + - key: misc-suspicious-string-compare.WarnOnImplicitComparison + value: '1' + - key: misc-suspicious-string-compare.WarnOnLogicalNotComparison + value: '0' + - key: misc-throw-by-value-catch-by-reference.CheckThrowTemporaries + value: '1' + - key: modernize-loop-convert.MaxCopySize + value: '16' + - key: modernize-loop-convert.MinConfidence + value: reasonable + - key: modernize-loop-convert.NamingStyle + value: CamelCase + - key: modernize-loop-convert.UseCxx20ReverseRanges + value: '0' + - key: modernize-make-unique.IgnoreMacros + value: '1' + - key: modernize-make-unique.IncludeStyle + value: 'llvm' + - key: modernize-make-unique.MakeSmartPtrFunction + value: 'std::make_unique' + - key: modernize-make-unique.MakeSmartPtrFunctionHeader + value: memory + - key: modernize-pass-by-value.IncludeStyle + value: llvm + - key: modernize-replace-auto-ptr.IncludeStyle + value: llvm + - key: modernize-use-emplace.ContainersWithPushBack + value: '::std::vector;::std::list;::std::deque' + - key: modernize-use-emplace.SmartPointers + value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr' + - key: modernize-use-emplace.TupleMakeFunctions + value: '::std::make_pair;::std::make_tuple' + - key: modernize-use-emplace.TupleTypes + value: '::std::pair;::std::tuple' + - key: modernize-use-noexcept.ReplacementString + value: '' + - key: modernize-use-noexcept.UseNoexceptFalse + value: '1' + - key: modernize-use-nullptr.NullMacros + value: 'NULL' + - key: modernize-use-transparent-functors.SafeMode + value: '0' + - key: performance-faster-string-find.StringLikeClasses + value: 'std::basic_string' + - key: performance-for-range-copy.WarnOnAllAutoCopies + value: '0' + - key: performance-inefficient-string-concatenation.StrictMode + value: '0' + - key: performance-inefficient-vector-operation.VectorLikeClasses + value: '::std::vector' + - key: performance-move-const-arg.CheckTriviallyCopyableMove + value: '1' + - key: performance-move-constructor-init.IncludeStyle + value: llvm + - key: performance-type-promotion-in-math-fn.IncludeStyle + value: llvm + - key: readability-braces-around-statements.ShortStatementLines + value: '0' + - key: readability-function-size.BranchThreshold + value: '4294967295' + - key: readability-function-size.LineThreshold + value: '1000' + - key: readability-function-size.NestingThreshold + value: '4294967295' + - key: readability-function-size.ParameterThreshold + value: '4294967295' + - key: readability-function-size.StatementThreshold + value: '800' + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.ClassConstantPrefix + value: k + - key: readability-identifier-naming.ClassConstantCase + value: CamelCase + - key: readability-identifier-naming.ClassMemberCase + value: camelBack + - key: readability-identifier-naming.ConstexprVariableCase + value: CamelCase + - key: readability-identifier-naming.ConstexprVariablePrefix + value: k + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.EnumConstantCase + value: CamelCase + - key: readability-identifier-naming.FunctionCase + value: camelBack + - key: readability-identifier-naming.GlobalConstantPrefix + value: k + - key: readability-identifier-naming.GlobalConstantCase + value: CamelCase + - key: readability-identifier-naming.IgnoreFailedSplit + value: '0' + - key: readability-identifier-naming.LocalConstantCase + value: camelBack + - key: readability-identifier-naming.MemberCase + value: camelBack + - key: readability-identifier-naming.MethodCase + value: camelBack + - key: readability-identifier-naming.ParameterCase + value: camelBack + - key: readability-identifier-naming.PrivateMemberCase + value: camelBack + - key: readability-identifier-naming.PrivateMemberPrefix + value: _ + - key: readability-identifier-naming.ProtectedMemberCase + value: camelBack + - key: readability-identifier-naming.ProtectedMemberPrefix + value: _ + - key: readability-identifier-naming.PublicMemberCase + value: camelBack + - key: readability-identifier-naming.TemplateParameterCase + value: camelBack + - key: readability-identifier-naming.TypeTemplateParameterCase + value: CamelCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + - key: readability-identifier-naming.VariableCase + value: camelBack + - key: readability-implicit-bool-conversion.AllowPointerConditions + value: '1' + - key: readability-simplify-boolean-expr.ChainedConditionalAssignment + value: '0' + - key: readability-simplify-boolean-expr.ChainedConditionalReturn + value: '0' + - key: readability-static-accessed-through-instance.NameSpecifierNestingThreshold + value: '3' +... diff --git a/.gitignore b/.gitignore index e06a57bc8..da3c883c5 100644 --- a/.gitignore +++ b/.gitignore @@ -48,3 +48,6 @@ fossa*.zip # https://github.com/google-github-actions/auth/issues/497 gha-creds-*.json + +# Compiled C++ extension modules +gigl/csrc/**/*.so diff --git a/Makefile b/Makefile index e15a063f3..170ab45ba 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG?=${DOCKER_IMAGE_MAIN_CPU_NAME}:${DATE} DOCKER_IMAGE_DEV_WORKBENCH_NAME_WITH_TAG?=${DOCKER_IMAGE_DEV_WORKBENCH_NAME}:${DATE} PYTHON_DIRS:=.github/scripts examples gigl tests snapchat scripts +CPP_SOURCES:=$(shell find gigl/csrc -name "*.cpp" 2>/dev/null) PY_TEST_FILES?="*_test.py" # You can override GIGL_TEST_DEFAULT_RESOURCE_CONFIG by setting it in your environment i.e. # adding `export GIGL_TEST_DEFAULT_RESOURCE_CONFIG=your_resource_config` to your shell config (~/.bashrc, ~/.zshrc, etc.) @@ -49,6 +50,7 @@ install_dev_deps: check_if_valid_env gcloud auth configure-docker us-central1-docker.pkg.dev bash ./requirements/install_py_deps.sh --dev bash ./requirements/install_scala_deps.sh + bash ./requirements/install_cpp_deps.sh uv pip install -e . uv run pre-commit install --hook-type pre-commit --hook-type pre-push @@ -75,7 +77,7 @@ assert_yaml_configs_parse: # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"` # By default, runs all tests under tests/unit. # See the help text for "--test_file_pattern" in tests/test_args.py for more details. -unit_test_py: clean_build_files_py type_check +unit_test_py: clean_build_files_py build_cpp_extensions type_check uv run python -m tests.unit.main \ --env=test \ --resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \ @@ -94,7 +96,12 @@ unit_test_scala: clean_build_files_scala # Eventually, we should look into splitting these up. # We run `make check_format` separately instead of as a dependent make rule so that it always runs after the actual testing. # We don't want to fail the tests due to non-conformant formatting during development. -unit_test: precondition_tests unit_test_py unit_test_scala +unit_test_cpp: + cmake -S tests/unit/cpp -B build/cpp_tests + cmake --build build/cpp_tests --parallel + ctest --test-dir build/cpp_tests --output-on-failure + +unit_test: precondition_tests unit_test_py unit_test_scala unit_test_cpp check_format_py: uv run autoflake --check --config pyproject.toml ${PYTHON_DIRS} @@ -109,13 +116,16 @@ check_format_md: @echo "Checking markdown files..." uv run mdformat --check ${MD_FILES} -check_format: check_format_py check_format_scala check_format_md +check_format_cpp: + $(if $(CPP_SOURCES), clang-format --dry-run --Werror --style=file $(CPP_SOURCES)) + +check_format: check_format_py check_format_cpp check_format_scala check_format_md # Set PY_TEST_FILES= to test a specifc file. # Ex. `make integration_test PY_TEST_FILES="dataflow_test.py"` # By default, runs all tests under tests/integration. # See the help text for "--test_file_pattern" in tests/test_args.py for more details. -integration_test: +integration_test: build_cpp_extensions uv run python -m tests.integration.main \ --env=test \ --resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \ @@ -143,12 +153,30 @@ format_md: @echo "Formatting markdown files..." uv run mdformat ${MD_FILES} -format: format_py format_scala format_md +format_cpp: + $(if $(CPP_SOURCES), clang-format -i --style=file $(CPP_SOURCES)) + +format: format_py format_cpp format_scala format_md type_check: uv run mypy ${PYTHON_DIRS} --check-untyped-defs -lint_test: check_format assert_yaml_configs_parse +build_cpp_extensions: + uv run --no-sync python -m scripts.build_cpp_extensions build_ext --inplace + +generate_compile_commands: + uv run python -m scripts.generate_compile_commands + +check_lint_cpp: + $(if $(CPP_SOURCES), uv run python -m scripts.run_cpp_lint $(CPP_SOURCES)) + +# Not part of `make format`: clang-tidy --fix rewrites logic (renames identifiers, +# changes expressions, adds/removes keywords), not just style. Run manually and +# review the diff before committing. +fix_lint_cpp: generate_compile_commands + $(if $(CPP_SOURCES), clang-tidy --fix -p build/compile_commands.json $(CPP_SOURCES)) + +lint_test: check_format assert_yaml_configs_parse check_lint_cpp @echo "Lint checks pass!" # compiles current working state of scala projects to local jars @@ -262,7 +290,7 @@ run_all_e2e_tests: # Example: # `make compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" compile_gigl_kubeflow_pipeline` # Can be a GCS URI as well -compile_gigl_kubeflow_pipeline: compile_jars push_new_docker_images +compile_gigl_kubeflow_pipeline: build_cpp_extensions compile_jars push_new_docker_images uv run python -m gigl.orchestration.kubeflow.runner \ --action=compile \ --container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG} \ @@ -288,7 +316,7 @@ _skip_build_deps: # job_name=... \ , and other params # compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" \ # run_dev_gnn_kubeflow_pipeline -run_dev_gnn_kubeflow_pipeline: $(if $(compiled_pipeline_path), _skip_build_deps, compile_jars push_new_docker_images) +run_dev_gnn_kubeflow_pipeline: $(if $(compiled_pipeline_path), _skip_build_deps, build_cpp_extensions compile_jars push_new_docker_images) uv run python -m gigl.orchestration.kubeflow.runner \ $(if $(compiled_pipeline_path),,--container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG}) \ $(if $(compiled_pipeline_path),,--container_image_cpu=${DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG}) \ @@ -313,7 +341,10 @@ clean_build_files_scala: ( cd scala; sbt clean; find . -type d -name "target" -prune -exec rm -rf {} \; ) ( cd scala_spark35; sbt clean; find . -type d -name "target" -prune -exec rm -rf {} \; ) -clean_build_files: clean_build_files_py clean_build_files_scala +clean_build_files_cpp: + rm -rf build/ + +clean_build_files: clean_build_files_py clean_build_files_scala clean_build_files_cpp # Call to generate new proto definitions if any of the .proto files have been changed. # We intentionally rebuild *all* protos with one commmand as they should all be in sync. diff --git a/containers/Dockerfile.src b/containers/Dockerfile.src index b80295962..7b105084a 100644 --- a/containers/Dockerfile.src +++ b/containers/Dockerfile.src @@ -15,8 +15,10 @@ COPY uv.lock uv.lock COPY gigl/dep_vars.env gigl/dep_vars.env COPY deployment deployment COPY gigl gigl +COPY scripts scripts COPY snapchat snapchat COPY tests tests COPY examples examples RUN uv pip install -e . +RUN uv run python scripts/build_cpp_extensions.py build_ext --inplace diff --git a/docs/cpp_style_guide.md b/docs/cpp_style_guide.md new file mode 100644 index 000000000..33d3a2144 --- /dev/null +++ b/docs/cpp_style_guide.md @@ -0,0 +1,153 @@ +# C++ Style Guide + +GiGL enforces C++ style automatically via two tools: + +- **clang-format** (`.clang-format`) — code formatting +- **clang-tidy** (`.clang-tidy`) — static analysis and lint + +All clang-tidy warnings are treated as errors. + +## Running the Tools + +```bash +make format_cpp # Format all C++ files in-place +make check_lint_cpp # Run clang-tidy static analysis +``` + +______________________________________________________________________ + +## Formatting (`.clang-format`) + +The style is based on LLVM with the following notable deviations: + +### Line length + +``` +ColumnLimit: 120 +``` + +120 columns rather than the LLVM default of 80. ML and graph code tends to have longer identifiers and nested template +types; 120 gives enough room without forcing awkward wraps. + +### Indentation and braces + +``` +IndentWidth: 4 +BreakBeforeBraces: Attach # K&R / "same-line" style +UseTab: Never +IndentCaseLabels: true # case labels indented inside switch +NamespaceIndentation: None # namespace bodies not indented +``` + +### Pointer and reference alignment + +``` +PointerAlignment: Left +``` + +Pointers bind to the type, not the name: `int* x`, not `int *x`. + +### Parameter and argument wrapping + +``` +BinPackArguments: false +BinPackParameters: false +``` + +When a function call or declaration doesn't fit on one line, every argument/parameter gets its own line. Mixed +"bin-packing" (some on one line, some wrapped) is not allowed. + +### Templates + +``` +AlwaysBreakTemplateDeclarations: true +``` + +`template <...>` always appears on its own line, keeping the declaration signature visually separate from the template +header. + +### Include ordering + +Includes are sorted and split into three priority groups: + +| Priority | Pattern | Group | +| -------- | ------------------------------------ | ------------------------------------- | +| 1 | `.*` | Local project headers (first) | +| 2 | `^"(llvm\|llvm-c\|clang\|clang-c)/"` | LLVM/Clang internal headers | +| 3 | `^(<\|"(gtest\|isl\|json)/)` | System and third-party headers (last) | + +### Raw string formatting + +Raw string literals with the `pb` delimiter (e.g. `R"pb(...)pb"`) are formatted as TextProto using Google style, +matching the protobuf idiom used throughout the codebase. + +______________________________________________________________________ + +## Static Analysis (`.clang-tidy`) + +### Check philosophy + +A broad set of check families is enabled to catch bugs, enforce modern C++ idioms, and maintain readability. All +warnings are errors — there is no "warning-only" category. + +Enabled families: + +| Family | What it covers | +| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| `boost-use-to-string` | Prefer `std::to_string` over `boost::lexical_cast` for numeric conversions | +| `bugprone-*` | Common programming mistakes: dangling handles, suspicious string construction, assert side effects, etc. | +| `cert-*` | CERT secure coding rules for error handling (`err34-c`), floating-point loops (`flp30-c`), and RNG seeding (`msc32-c`, `msc50/51-cpp`) | +| `clang-diagnostic-*` | Compiler diagnostic warnings surfaced as lint checks (e.g. `-Wall`, `-Wextra` violations) | +| `cppcoreguidelines-*` | C++ Core Guidelines: no raw `malloc`, no union member access, no object slicing, safe downcasts | +| `google-*` | Google C++ style: explicit constructors, no global names in headers, safe `memset` usage | +| `hicpp-exception-baseclass` | All thrown exceptions must derive from `std::exception` | +| `misc-*` | Miscellaneous: header-only definitions, suspicious enum usage, throw-by-value/catch-by-reference, etc. | +| `modernize-*` | Modernize to C++11/14/17: `nullptr`, range-based for, `make_unique`, `using` aliases, etc. | +| `performance-*` | Unnecessary copies, inefficient string ops, missed `emplace`, type promotions in math functions | +| `readability-*` | Naming conventions, braces around statements, boolean simplification, function size limits | + +### Disabled checks + +Some checks in the above families are disabled where they produce excessive noise or conflict with common patterns in +this codebase: + +| Disabled check | Reason | +| ----------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `bugprone-easily-swappable-parameters` | Tensor and sampler APIs legitimately have many adjacent same-typed parameters | +| `bugprone-implicit-widening-of-multiplication-result` | Crashes clang-tidy 15 on a construct in `ATen/core/dynamic_type.h` (upstream LLVM bug). Re-enable when upgrading past clang-tidy 15. | +| `bugprone-narrowing-conversions` | Too noisy in ML code mixing `int`/`int64_t`/`size_t` for tensor dimensions | +| `misc-confusable-identifiers` | Performs an O(n²) comparison of all identifiers in scope to detect Unicode homoglyphs. PyTorch headers introduce thousands of identifiers, making this check account for ~70% of total lint time. All identifiers in this codebase are standard ASCII. | +| `misc-const-correctness` | Produces false positives with pybind11 types whose mutation happens through `operator[]` (which is non-const). The check incorrectly suggests `const` on variables that are mutated. | +| `misc-no-recursion` | Recursive graph algorithms are intentional | +| `modernize-avoid-c-arrays` | C arrays are needed for pybind11 and C-interop code | +| `modernize-use-trailing-return-type` | Trailing return types (`auto f() -> T`) are only useful when the return type depends on template params. Requiring them everywhere is non-standard and reduces readability. | +| `readability-avoid-const-params-in-decls` | Incorrectly fires on `const T&` parameters in multi-line declarations (clang-tidy 15 bug). The check is meant for top-level const on by-value params, which is a separate, valid concern. | +| `readability-container-contains` | `.contains()` requires C++20; the codebase builds with C++17 | +| `readability-identifier-length` | Short loop variables (`i`, `j`, `k`) are idiomatic | +| `readability-function-cognitive-complexity` | Algorithmic code often requires nesting that is inherent to the problem structure. Enforcing an arbitrary complexity ceiling discourages clarity and encourages artificial decomposition. | +| `readability-magic-numbers` | Literal constants are common in ML code (e.g. feature dimensions) | + +### Naming conventions + +Enforced via `readability-identifier-naming`: + +| Identifier kind | Convention | Example | +| --------------------------------------------------------- | --------------------------- | ----------------- | +| Classes, enums, unions | `CamelCase` | `DistDataset` | +| Type template parameters | `CamelCase` | `NodeType` | +| Functions, methods | `camelBack` | `sampleNeighbors` | +| Variables, parameters, members | `camelBack` | `numNodes` | +| Private/protected members | `camelBack` with `_` prefix | `_nodeFeatures` | +| Constants (`constexpr`, `const` globals, class constants) | `CamelCase` with `k` prefix | `kMaxBatchSize` | + +### Key option tuning + +| Option | Value | Effect | +| ---------------------------------------------------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `WarningsAsErrors` | `*` | Every check failure is a hard error in CI | +| `HeaderFilterRegex` | `.*/gigl/csrc/.*` | Scopes checks to our own headers. Using `.*` causes clang-tidy to report warnings from every PyTorch/pybind11 header it parses, flooding output with thousands of third-party issues. | +| `FormatStyle` | `none` | clang-tidy does not auto-reformat; use clang-format separately | +| `bugprone-string-constructor.LargeLengthThreshold` | `8388608` (8 MB) | Strings larger than 8 MB from a length argument are flagged | +| `modernize-loop-convert.NamingStyle` | `CamelCase` | Auto-generated loop variable names use CamelCase | +| `readability-function-size.LineThreshold` | `1000` | Functions over 1000 lines are flagged | +| `readability-braces-around-statements.ShortStatementLines` | `0` | Braces required for all control-flow bodies, even single-line | diff --git a/gigl/csrc/__init__.py b/gigl/csrc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/gigl/scripts/post_install.py b/gigl/scripts/post_install.py index cd13c4b0b..bafc4ae21 100644 --- a/gigl/scripts/post_install.py +++ b/gigl/scripts/post_install.py @@ -3,64 +3,56 @@ Once GiGL is installed w/ `pip install gigl`, this script can be executed by running: `gigl-post-install` -This script is used to install the dependencies for GIGL. -- Currently, it installs GLT by running install_glt.sh. +This script is used to install the dependencies for GIGL: +- Installs GLT by running install_glt.sh. +- Builds pybind11 C++ extensions in-place so they are importable without a separate build step. """ import subprocess import sys from pathlib import Path -from typing import Optional - - -def run_command_and_stream_stdout(cmd: str) -> Optional[int]: - """ - Executes a command and streams the stdout output. - - Args: - cmd (str): The command to be executed. - - Returns: - Optional[int]: The return code of the command, or None if the command failed to execute. - """ - process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) - while True: - output = process.stdout.readline() # type: ignore - if output == b"" and process.poll() is not None: - break - if output: - print(output.strip()) - return_code: Optional[int] = process.poll() - return return_code def main(): """Main entry point for the post-install script.""" print("Running GIGL post-install script...") - # Get the directory where this script is located script_dir = Path(__file__).parent + repo_root = script_dir.parent.parent - # Path to the install_glt.sh script + # Step 1: Install GLT install_glt_script = script_dir / "install_glt.sh" - if not install_glt_script.exists(): print(f"Error: install_glt.sh not found at {install_glt_script}") sys.exit(1) - cmd = f"bash {install_glt_script}" - try: - print(f"Executing {cmd}...") - result = run_command_and_stream_stdout(cmd) - print("Post-install script finished running, with return code: ", result) - return result - + print(f"Installing GLT via {install_glt_script}...") + subprocess.run(["bash", str(install_glt_script)], check=True) + print("GLT install finished.") except subprocess.CalledProcessError as e: - print(f"Error running install_glt.sh: {e}") + print(f"Error installing GLT: {e}") + sys.exit(1) + + # Step 2: Build pybind11 C++ extensions in-place so they are importable + # without requiring a separate `make build_cpp_extensions` call. + # subprocess.run streams stdout/stderr to the terminal and raises + # CalledProcessError on a non-zero exit code. + build_cpp_script = repo_root / "scripts" / "build_cpp_extensions.py" + if not build_cpp_script.exists(): + print(f"Error: build_cpp_extensions.py not found at {build_cpp_script}") sys.exit(1) - except Exception as e: - print(f"Unexpected error: {e}") + + try: + print("Building C++ extensions...") + subprocess.run( + [sys.executable, str(build_cpp_script), "build_ext", "--inplace"], + cwd=repo_root, + check=True, + ) + print("C++ extension build finished.") + except subprocess.CalledProcessError as e: + print(f"Error building C++ extensions: {e}") sys.exit(1) diff --git a/mypy.ini b/mypy.ini index d488c2a83..b034f60d4 100644 --- a/mypy.ini +++ b/mypy.ini @@ -16,6 +16,9 @@ ignore_missing_imports = True [mypy-setuptools] ignore_missing_imports = True +[mypy-setuptools.extension] +ignore_missing_imports = True + [mypy-tensorflow_data_validation] ignore_missing_imports = True diff --git a/requirements/install_cpp_deps.sh b/requirements/install_cpp_deps.sh new file mode 100644 index 000000000..25570eab8 --- /dev/null +++ b/requirements/install_cpp_deps.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Install C++ development tools: clang-format, clang-tidy, cmake. +# +# Usage: +# bash requirements/install_cpp_deps.sh +# +# Called by `make install_dev_deps` alongside install_py_deps.sh and +# install_scala_deps.sh. +# +# NOTE: On Linux, this script calls apt-get and update-alternatives, which +# require root privileges. Run as root or prefix with sudo. + +set -e +set -x + +CLANG_VERSION=15 + +is_running_on_mac() { + [ "$(uname)" == "Darwin" ] + return $? +} + +if is_running_on_mac; then + # macOS ships its own Apple Clang via Xcode Command Line Tools. Homebrew + # intentionally does not put its llvm binaries on PATH to avoid shadowing + # Apple's clang. We therefore have to add the Homebrew llvm bin directory + # to PATH ourselves so that `clang-format` and `clang-tidy` resolve to the + # Homebrew versions rather than being missing entirely. + brew install llvm cmake + LLVM_BIN="$(brew --prefix llvm)/bin" + + # Append to any shell rc files that exist and don't already include it. + for rc_file in ~/.zshrc ~/.bashrc; do + if [ -f "$rc_file" ] && ! grep -qF "$LLVM_BIN" "$rc_file"; then + printf '\n# Added by GiGL install_cpp_deps.sh\nexport PATH="%s:$PATH"\n' "$LLVM_BIN" >> "$rc_file" + echo "Added LLVM bin to PATH in $rc_file" + fi + done + + # NOTE: this export only affects subprocesses of this script, not the calling + # shell or make process. Open a new terminal (or run `source ~/.zshrc`) after + # install_dev_deps to pick up the PATH change. + export PATH="$LLVM_BIN:$PATH" +else + # On Linux, apt-get installs versioned binaries (e.g. clang-format-15) directly + # into /usr/bin. update-alternatives wires up the bare names (clang-format, + # clang-tidy) so callers don't need to specify the version suffix. No PATH + # changes are needed since /usr/bin is already on PATH. + apt-get install -y "clang-format-${CLANG_VERSION}" "clang-tidy-${CLANG_VERSION}" "clangd-${CLANG_VERSION}" cmake + update-alternatives --install /usr/bin/clang-format clang-format "/usr/bin/clang-format-${CLANG_VERSION}" 100 + update-alternatives --install /usr/bin/clang-tidy clang-tidy "/usr/bin/clang-tidy-${CLANG_VERSION}" 100 + update-alternatives --install /usr/bin/clangd clangd "/usr/bin/clangd-${CLANG_VERSION}" 100 +fi + +echo "Finished installing C++ tooling" diff --git a/scripts/_cpp_config.py b/scripts/_cpp_config.py new file mode 100644 index 000000000..e10ce1a0f --- /dev/null +++ b/scripts/_cpp_config.py @@ -0,0 +1,9 @@ +"""Shared C++ build configuration used by build_cpp_extensions.py and generate_compile_commands.py.""" + +COMPILE_ARGS: list[str] = [ + "-O3", + "-std=c++17", + "-Wall", + "-Wextra", + "-Wno-unused-parameter", +] diff --git a/scripts/build_cpp_extensions.py b/scripts/build_cpp_extensions.py new file mode 100644 index 000000000..42a983aa8 --- /dev/null +++ b/scripts/build_cpp_extensions.py @@ -0,0 +1,55 @@ +"""Build script for GiGL pybind11 C++ extensions. + +Invoked by ``make build_cpp_extensions`` and automatically during ``make install_dev_deps`` +via ``post_install.py``. Not a general-purpose setup.py — only builds C++ extensions. + +Usage:: + + python build_cpp_extensions.py build_ext --inplace +""" + +from pathlib import Path + +from setuptools import setup +from setuptools.extension import Extension +from torch.utils.cpp_extension import BuildExtension, CppExtension + +from scripts._cpp_config import COMPILE_ARGS + +_REPO_ROOT: Path = Path(__file__).resolve().parent.parent +_CSRC_DIR: Path = _REPO_ROOT / "gigl" / "csrc" + + +def find_cpp_extensions() -> list[Extension]: + """Auto-discover pybind11 extension modules under ``gigl/csrc/``. + + Following PyTorch's csrc convention, only files named ``python_*.cpp`` are + compiled as Python extension modules. + + Returns an empty list if ``gigl/csrc/`` does not yet exist. + """ + if not _CSRC_DIR.exists(): + return [] + extensions = [] + for cpp_file in sorted(_CSRC_DIR.rglob("python_*.cpp")): + parts = list(cpp_file.with_suffix("").parts) + parts[-1] = parts[-1].removeprefix("python_") + module_name = ".".join(parts) + impl_file = cpp_file.parent / (parts[-1] + ".cpp") + sources = [str(cpp_file)] + if impl_file.exists(): + sources.append(str(impl_file)) + extensions.append( + CppExtension( + name=module_name, + sources=sources, + extra_compile_args=COMPILE_ARGS, + ) + ) + return extensions + + +setup( + ext_modules=find_cpp_extensions(), + cmdclass={"build_ext": BuildExtension}, +) diff --git a/scripts/generate_compile_commands.py b/scripts/generate_compile_commands.py new file mode 100644 index 000000000..f6be10003 --- /dev/null +++ b/scripts/generate_compile_commands.py @@ -0,0 +1,100 @@ +"""Generate build/compile_commands.json for clangd. + +clangd requires a compilation database to resolve include paths and compiler +flags. This script derives those paths from the installed torch and pybind11 +packages and writes ``build/compile_commands.json``. + +Primary use: called by ``run_cpp_lint.py`` before running clangd checks, and +by ``make generate_compile_commands`` when you need to refresh the database +manually (e.g. after adding new source files or changing compiler flags). + +Usage:: + + make generate_compile_commands +""" + +import json +import subprocess +import sys +import sysconfig +import warnings +from pathlib import Path + +from scripts._cpp_config import COMPILE_ARGS + +_REPO_ROOT: Path = Path(__file__).resolve().parent.parent +_CSRC_DIR: Path = _REPO_ROOT / "gigl" / "csrc" +_COMPILE_COMMANDS: Path = _REPO_ROOT / "build" / "compile_commands.json" + + +def _get_cxx_system_include_flags() -> list[str]: + """Return -isystem flags for the C++ standard library headers. + + clang++-15 on some machines is not configured with the correct GCC toolchain + path, so it cannot find standard headers like on its own. We ask + the system g++ compiler for its include search paths and pass them explicitly, + which is the reliable cross-machine approach. + """ + try: + result = subprocess.run( + ["c++", "-v", "-x", "c++", "-E", "/dev/null"], + capture_output=True, + text=True, + ) + includes: list[str] = [] + in_section = False + for line in result.stderr.splitlines(): + if "#include <...> search starts here:" in line: + in_section = True + continue + if "End of search list." in line: + break + if in_section and line.strip(): + includes.append(f"-isystem{line.strip()}") + return includes + except FileNotFoundError: + print( + "Warning: 'c++' not found; C++ system headers will be missing", + file=sys.stderr, + ) + return [] + + +def write_compile_commands() -> None: + """Write build/compile_commands.json for clangd.""" + # Suppress PyTorch's CUDA-not-found warning emitted at import time. + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + from torch.utils.cpp_extension import include_paths as torch_include_paths + + # -isystem marks these as system headers so clangd skips deep analysis of + # PyTorch internals. + include_flags: list[str] = [f"-isystem{p}" for p in torch_include_paths()] + include_flags.append(f"-isystem{sysconfig.get_path('include')}") + include_flags.extend(_get_cxx_system_include_flags()) + + cpp_sources = sorted(_CSRC_DIR.rglob("*.cpp")) if _CSRC_DIR.exists() else [] + if not cpp_sources: + print(f"Warning: no .cpp files found under {_CSRC_DIR}", file=sys.stderr) + + cxx_flags = " ".join(COMPILE_ARGS) + commands: list[dict[str, str]] = [ + { + "directory": str(_REPO_ROOT), + "file": str(source), + "command": f"clang++ {cxx_flags} {' '.join(include_flags)} -c {source}", + } + for source in cpp_sources + ] + + _COMPILE_COMMANDS.parent.mkdir(exist_ok=True) + _COMPILE_COMMANDS.write_text(json.dumps(commands, indent=2)) + + +def main() -> None: + write_compile_commands() + print(f"Wrote {_COMPILE_COMMANDS}") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_cpp_lint.py b/scripts/run_cpp_lint.py new file mode 100644 index 000000000..69c834fa0 --- /dev/null +++ b/scripts/run_cpp_lint.py @@ -0,0 +1,71 @@ +"""Run C++ lint on source files using clangd. + +Generates compile_commands.json, then runs clangd --check on each file in +parallel and prints a clean summary. + +Usage:: + + uv run python scripts/run_cpp_lint.py [file2.cpp] ... +""" + +import re +import subprocess +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +from scripts.generate_compile_commands import _COMPILE_COMMANDS, write_compile_commands + +# Matches real clang-tidy diagnostics emitted by clangd: +# E[HH:MM:SS.mmm] [check-name] Line N: message +_DIAGNOSTIC_RE = re.compile(r"^E\[[\d:.]+\] (\[.+\] .+)$") + + +def _check_file(source: Path) -> tuple[Path, list[str]]: + result = subprocess.run( + [ + "clangd", + f"--check={source}", + f"--compile-commands-dir={_COMPILE_COMMANDS.parent}", + ], + capture_output=True, + text=True, + ) + diagnostics: list[str] = [] + for line in result.stderr.splitlines(): + m = _DIAGNOSTIC_RE.match(line) + if m: + diagnostics.append(m.group(1)) + return source, diagnostics + + +def main() -> None: + sources = [Path(s) for s in sys.argv[1:]] + if not sources: + sys.exit(0) + + write_compile_commands() + + failures: dict[Path, list[str]] = {} + with ThreadPoolExecutor() as executor: + futures = {executor.submit(_check_file, s): s for s in sources} + for future in as_completed(futures): + source, diagnostics = future.result() + if diagnostics: + failures[source] = diagnostics + + if not failures: + print("\033[32mC++ lint passed.\033[0m") + else: + for source in sorted(failures): + print(f" FAIL {source}") + for d in failures[source]: + print(f" {d}") + print( + "\nRun \033[1mmake fix_lint_cpp\033[0m to auto-fix violations where possible." + ) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tests/unit/cpp/CMakeLists.txt b/tests/unit/cpp/CMakeLists.txt new file mode 100644 index 000000000..514625587 --- /dev/null +++ b/tests/unit/cpp/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required(VERSION 3.18) +project(GiGLCppTests CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# --------------------------------------------------------------------------- +# GoogleTest via FetchContent +# --------------------------------------------------------------------------- +include(FetchContent) +FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz + URL_HASH SHA256=8372520bab45e12a97cd2f49dad36d07e55ddb89c0e39fad4a1a64cab0bdf35d +) +# Prevent GoogleTest from overriding the compiler's runtime on Windows +# (no-op on Linux/Mac, but required for portable CMake config). +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Required for add_test() to register tests with CTest. +enable_testing() + +# --------------------------------------------------------------------------- +# Auto-discover test targets +# --------------------------------------------------------------------------- +# Any file named *_test.cpp in this directory (or subdirectories) is +# automatically compiled into its own test binary and registered with CTest. +# To add a new test suite, drop a *_test.cpp file here — no changes to this +# file required. This matches the *_test.py convention used for Python tests. +file(GLOB_RECURSE TEST_SOURCES "*_test.cpp") + +foreach(test_source ${TEST_SOURCES}) + # Derive the binary name from the filename, e.g.: + # ppr_forward_push_test.cpp → ppr_forward_push_test + get_filename_component(test_name ${test_source} NAME_WE) + add_executable(${test_name} ${test_source}) + target_link_libraries(${test_name} GTest::gtest_main) + # add_test registers the binary with CTest. Each *_test binary is one + # CTest entry; GoogleTest itself reports individual TEST() results inside it. + add_test(NAME ${test_name} COMMAND ${test_name}) +endforeach() diff --git a/tests/unit/cpp/infrastructure_test.cpp b/tests/unit/cpp/infrastructure_test.cpp new file mode 100644 index 000000000..af85a6660 --- /dev/null +++ b/tests/unit/cpp/infrastructure_test.cpp @@ -0,0 +1,13 @@ +// Placeholder C++ unit test. +// +// This file exists to verify that the GoogleTest infrastructure compiles and +// runs end-to-end. + +#include + +// A trivial sanity-check test — if this fails, something is very wrong with +// the build environment itself. +TEST(PlaceholderTest, BasicArithmetic) { + EXPECT_EQ(1 + 1, 2); + EXPECT_NE(1 + 1, 3); +}