From 87dc95b08b289bcc814daa8aebe5eebe8012a5bd Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Tue, 24 Mar 2026 20:24:14 +0000
Subject: [PATCH 01/14] Initial commit

---
 gigl/distributed/cpp_extensions/__init__.py   |   9 +
 .../cpp_extensions/ppr_forward_push.cpp       | 294 +++++++++++++
 .../cpp_extensions/ppr_forward_push.pyi       |  21 +
 gigl/distributed/dist_ppr_sampler.py          | 388 +++++++-----------
 4 files changed, 463 insertions(+), 249 deletions(-)
 create mode 100644 gigl/distributed/cpp_extensions/__init__.py
 create mode 100644 gigl/distributed/cpp_extensions/ppr_forward_push.cpp
 create mode 100644 gigl/distributed/cpp_extensions/ppr_forward_push.pyi
diff --git a/gigl/distributed/cpp_extensions/__init__.py b/gigl/distributed/cpp_extensions/__init__.py
new file mode 100644
index 000000000..d375f59b1
--- /dev/null
+++ b/gigl/distributed/cpp_extensions/__init__.py
@@ -0,0 +1,9 @@
+try:
+    from gigl.distributed.cpp_extensions.ppr_forward_push import PPRForwardPushState
+except ImportError as e:
+    raise ImportError(
+        "PPR C++ extension not compiled. "
+        "Run `uv pip install -e .` from the GiGL root to build it."
+    ) from e
+
+__all__ = ["PPRForwardPushState"]
diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
new file mode 100644
index 000000000..6f6d10545
--- /dev/null
+++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
@@ -0,0 +1,294 @@
+#include <torch/extension.h>
+#include <pybind11/stl.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace py = pybind11;
+
+// Pack (node_id, etype_id) into one uint64_t lookup key (node_id high, etype_id low).
+// Requires both values fit in 32 bits — enforced by the Python caller.
+static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
+    const uint64_t high_bits = static_cast<uint64_t>(static_cast<uint32_t>(node_id)) << 32;
+    return high_bits | static_cast<uint32_t>(etype_id);
+}
+
+// C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006).
+//
+// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache.
+// Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors).
+// Node IDs are int32_t internally; int64 IDs read from Python tensors are
+// range-checked by to_node_id() rather than silently truncated.
+//
+// Typical call sequence per batch:
+//   1.  PPRForwardPushState(seed_nodes, ...)   — init per-seed residuals / queue
+//   while True:
+//   2.  drain_queue()                          — drain queue → nodes needing lookup (None: stop)
+//   3.  <Python: _batch_fetch_neighbors(...)>  — distributed RPC fetch (stays in Python)
+//   4.  push_residuals(fetched_by_etype_id)    — push residuals, update queue
+//   5.  extract_top_k(max_ppr_nodes)           — after the loop: top-k per seed per node type
+class PPRForwardPushState {
+public:
+    PPRForwardPushState(
+        torch::Tensor seed_nodes, int32_t seed_node_type_id,
+        float alpha, float requeue_threshold_factor,
+        std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
+        std::vector<int32_t> edge_type_to_dst_ntype_id,
+        std::vector<torch::Tensor> degree_tensors)
+        : alpha_(alpha),
+          one_minus_alpha_(1.0f - alpha),
+          requeue_threshold_factor_(requeue_threshold_factor),
+          node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
+          edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
+          degree_tensors_(std::move(degree_tensors)) {
+        TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D");
+        batch_size_     = static_cast<int32_t>(seed_nodes.size(0));
+        num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());
+        TORCH_CHECK(seed_node_type_id >= 0 && seed_node_type_id < num_node_types_,
+                    "seed_node_type_id ", seed_node_type_id, " is out of range.");
+        // get_total_degree() indexes the raw int32 buffer, so reject other layouts up front.
+        for (const auto& t : degree_tensors_) {
+            TORCH_CHECK(t.device().is_cpu() && t.scalar_type() == torch::kInt && t.is_contiguous(),
+                        "degree_tensors must be contiguous int32 CPU tensors.");
+        }
+
+        ppr_scores_.assign(batch_size_,    std::vector<std::unordered_map<int32_t, float>>(num_node_types_));
+        residuals_.assign(batch_size_,     std::vector<std::unordered_map<int32_t, float>>(num_node_types_));
+        queue_.assign(batch_size_,         std::vector<std::unordered_set<int32_t>>(num_node_types_));
+        queued_nodes_.assign(batch_size_,  std::vector<std::unordered_set<int32_t>>(num_node_types_));
+
+        // accessor() dereferences host memory, so read the seeds from a CPU int64 tensor.
+        const torch::Tensor seeds_cpu = seed_nodes.to(torch::kCPU, torch::kLong);
+        const auto seeds = seeds_cpu.accessor<int64_t, 1>();
+        num_nodes_in_queue_ = batch_size_;
+        for (int32_t i = 0; i < batch_size_; ++i) {
+            const int32_t seed = to_node_id(seeds[i]);
+            residuals_[i][seed_node_type_id][seed] = alpha_;
+            queue_[i][seed_node_type_id].insert(seed);
+        }
+    }
+
+    // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch
+    // neighbor lookup.  Also snapshots the drained nodes into queued_nodes_ for
+    // use by push_residuals().
+    //
+    // Returns None when the queue is truly empty (convergence signal).
+    // Returns a dict (possibly empty) when nodes were drained but all had cached
+    // neighbors or no outgoing edges — push_residuals must still be called to
+    // flush their residuals into ppr_scores_.
+    py::object drain_queue() {
+        if (num_nodes_in_queue_ == 0) return py::none();
+
+        for (auto& per_seed : queued_nodes_)
+            for (auto& snapshot : per_seed) snapshot.clear();
+
+        std::unordered_map<int32_t, std::unordered_set<int32_t>> nodes_to_lookup;
+        for (int32_t s = 0; s < batch_size_; ++s) {
+            for (int32_t nt = 0; nt < num_node_types_; ++nt) {
+                if (queue_[s][nt].empty()) continue;
+
+                // The snapshot was cleared above, so swap() leaves the live queue empty.
+                auto& snapshot = queued_nodes_[s][nt];
+                snapshot.swap(queue_[s][nt]);
+                num_nodes_in_queue_ -= static_cast<int32_t>(snapshot.size());
+
+                for (int32_t node_id : snapshot) {
+                    for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
+                        // Only add to lookup if not already in the persistent cache.
+                        if (neighbor_cache_.count(pack_key(node_id, eid)) == 0)
+                            nodes_to_lookup[eid].insert(node_id);
+                    }
+                }
+            }
+        }
+
+        py::dict result;
+        for (const auto& [eid, node_set] : nodes_to_lookup) {
+            std::vector<int64_t> ids(node_set.begin(), node_set.end());
+            result[py::int_(eid)] = torch::tensor(ids, torch::kLong);
+        }
+        return result;
+    }
+
+    // Push residuals to neighbors given the fetched neighbor data.
+    // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)}
+    //   - node_ids_tensor:  [N] int64 — source node IDs fetched for this edge type
+    //   - flat_nbrs_tensor: [sum(counts)] int64 — flat concatenation of all neighbor lists
+    //   - counts_tensor:    [N] int64 — number of neighbors for each source node
+    // Throws (TORCH_CHECK) if the tensors are inconsistent or an ID does not fit in int32.
+    void push_residuals(py::dict fetched_by_etype_id) {
+        // Build local fetched map: pack_key(node_id, etype_id) -> neighbor list.
+        std::unordered_map<uint64_t, std::vector<int32_t>> fetched;
+        for (auto item : fetched_by_etype_id) {
+            const int32_t eid = item.first.cast<int32_t>();
+            auto tup          = item.second.cast<py::tuple>();
+            auto node_ids_t   = tup[0].cast<torch::Tensor>();
+            auto flat_nbrs_t  = tup[1].cast<torch::Tensor>();
+            auto counts_t     = tup[2].cast<torch::Tensor>();
+            auto node_acc     = node_ids_t.accessor<int64_t, 1>();
+            auto nbr_acc      = flat_nbrs_t.accessor<int64_t, 1>();
+            auto cnt_acc      = counts_t.accessor<int64_t, 1>();
+            TORCH_CHECK(counts_t.size(0) == node_ids_t.size(0),
+                        "counts and node_ids differ in length for etype_id ", eid);
+
+            int64_t offset = 0;
+            for (int64_t i = 0; i < node_ids_t.size(0); ++i) {
+                const int64_t count = cnt_acc[i];
+                TORCH_CHECK(count >= 0 && count <= flat_nbrs_t.size(0) - offset,
+                            "neighbor counts exceed flat_nbrs length for etype_id ", eid);
+                std::vector<int32_t> nbrs(count);
+                for (int64_t j = 0; j < count; ++j) nbrs[j] = to_node_id(nbr_acc[offset + j]);
+                fetched[pack_key(to_node_id(node_acc[i]), eid)] = std::move(nbrs);
+                offset += count;
+            }
+        }
+
+        for (int32_t s = 0; s < batch_size_; ++s) {
+            for (int32_t nt = 0; nt < num_node_types_; ++nt) {
+                for (int32_t src : queued_nodes_[s][nt]) {
+                    // Absorb the residual into the score; operator[] inserts 0 if absent.
+                    float& src_residual = residuals_[s][nt][src];
+                    const float res = src_residual;
+                    ppr_scores_[s][nt][src] += res;
+                    src_residual = 0.0f;
+                    const int32_t total_deg = get_total_degree(src, nt);
+                    if (total_deg == 0) continue;
+                    const float res_per_nbr = one_minus_alpha_ * res / static_cast<float>(total_deg);
+
+                    for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
+                        // A key is in at most one of fetched / neighbor_cache: drain_queue
+                        // only requests a lookup for keys absent from neighbor_cache.
+                        const std::vector<int32_t>* nbr_list = nullptr;
+                        const uint64_t src_key = pack_key(src, eid);
+                        if (auto fi = fetched.find(src_key); fi != fetched.end()) {
+                            nbr_list = &fi->second;
+                        } else if (auto ci = neighbor_cache_.find(src_key); ci != neighbor_cache_.end()) {
+                            nbr_list = &ci->second;
+                        }
+                        if (!nbr_list || nbr_list->empty()) continue;
+
+                        const int32_t dst_nt = edge_type_to_dst_ntype_id_[eid];
+                        for (int32_t nbr : *nbr_list) {
+                            // Hold a reference so the update and threshold test share one lookup.
+                            float& nbr_residual = residuals_[s][dst_nt][nbr];
+                            nbr_residual += res_per_nbr;
+                            if (queue_[s][dst_nt].count(nbr) != 0) continue;  // already queued
+
+                            const float threshold = requeue_threshold_factor_ *
+                                static_cast<float>(get_total_degree(nbr, dst_nt));
+                            // Written as !(>=) so a NaN residual is never requeued.
+                            if (!(nbr_residual >= threshold)) continue;
+                            queue_[s][dst_nt].insert(nbr);
+                            ++num_nodes_in_queue_;
+
+                            // Promote the requeued node's neighbor lists to the persistent cache
+                            // so its next processing needs no re-fetch.  Nodes that are never
+                            // requeued (typically high-degree) stay out of the cache.
+                            for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) {
+                                const uint64_t pk = pack_key(nbr, peid);
+                                if (neighbor_cache_.count(pk) != 0) continue;
+                                if (auto pfi = fetched.find(pk); pfi != fetched.end())
+                                    neighbor_cache_.emplace(pk, pfi->second);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Extract top-k PPR nodes per seed per node type.
+    // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}.
+    // Only node types that received any PPR score are included in the output.
+    // Ties break by ascending node ID (reproducible); max_ppr_nodes <= 0 selects none.
+    py::dict extract_top_k(int32_t max_ppr_nodes) {
+        py::dict result;
+        for (int32_t nt = 0; nt < num_node_types_; ++nt) {
+            const bool has_scores = std::any_of(ppr_scores_.begin(), ppr_scores_.end(),
+                [nt](const auto& per_type) { return !per_type[nt].empty(); });
+            if (!has_scores) continue;
+
+            std::vector<int64_t> flat_ids;
+            std::vector<float>   flat_weights;
+            std::vector<int64_t> valid_counts;
+            for (const auto& per_type : ppr_scores_) {
+                const auto& scores = per_type[nt];
+                const int32_t k = std::max<int32_t>(0, std::min(max_ppr_nodes, static_cast<int32_t>(scores.size())));
+                if (k > 0) {
+                    std::vector<std::pair<int32_t, float>> items(scores.begin(), scores.end());
+                    std::partial_sort(items.begin(), items.begin() + k, items.end(),
+                        [](const auto& a, const auto& b) {
+                            return a.second != b.second ? a.second > b.second : a.first < b.first;
+                        });
+                    for (int32_t i = 0; i < k; ++i) {
+                        flat_ids.push_back(static_cast<int64_t>(items[i].first));
+                        flat_weights.push_back(items[i].second);
+                    }
+                }
+                valid_counts.push_back(static_cast<int64_t>(k));
+            }
+
+            result[py::int_(nt)] = py::make_tuple(
+                torch::tensor(flat_ids, torch::kLong),
+                torch::tensor(flat_weights, torch::kFloat),
+                torch::tensor(valid_counts, torch::kLong));
+        }
+        return result;
+    }
+
+private:
+    // Narrow an int64 ID from a Python tensor; negative or >32-bit values are rejected.
+    static int32_t to_node_id(int64_t raw_id) {
+        TORCH_CHECK(raw_id >= 0 && raw_id <= INT32_MAX,
+                    "Node ID ", raw_id, " does not fit in a non-negative int32.");
+        return static_cast<int32_t>(raw_id);
+    }
+
+    // Total degree of node_id, or 0 when ntype_id has no (or an empty) degree tensor.
+    int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const {
+        if (ntype_id < 0 || ntype_id >= static_cast<int32_t>(degree_tensors_.size())) return 0;
+        const auto& t = degree_tensors_[ntype_id];
+        if (t.numel() == 0) return 0;  // destination-only type: no outgoing edges
+        TORCH_CHECK(node_id >= 0 && static_cast<int64_t>(node_id) < t.size(0),
+            "Node ID ", node_id, " out of range for degree tensor of ntype_id ", ntype_id,
+            " (size=", t.size(0), "). This indicates corrupted graph data or a sampler bug.");
+        return t.data_ptr<int32_t>()[node_id];
+    }
+
+    float   alpha_, one_minus_alpha_, requeue_threshold_factor_;
+    int32_t batch_size_, num_node_types_, num_nodes_in_queue_{0};
+
+    std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;
+    std::vector<int32_t>              edge_type_to_dst_ntype_id_;
+    std::vector<torch::Tensor>        degree_tensors_;
+
+    // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]).
+    std::vector<std::vector<std::unordered_map<int32_t, float>>> ppr_scores_;
+    std::vector<std::vector<std::unordered_map<int32_t, float>>> residuals_;
+    std::vector<std::vector<std::unordered_set<int32_t>>>         queue_;
+    // Snapshot of queue contents from the last drain_queue() call, used by push_residuals().
+    std::vector<std::vector<std::unordered_set<int32_t>>>         queued_nodes_;
+
+    // Persistent neighbor cache: pack_key(node_id, etype_id) -> neighbor list.
+    // Only nodes that have been requeued (and thus will be processed again) are
+    // promoted here from the per-iteration fetched map.
+    std::unordered_map<uint64_t, std::vector<int32_t>> neighbor_cache_;
+};
+
+// Python bindings.  Argument names mirror ppr_forward_push.pyi so the stubbed
+// keyword arguments are accepted at runtime as well as by the type checker.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    py::class_<PPRForwardPushState>(m, "PPRForwardPushState")
+        .def(py::init<torch::Tensor, int32_t, float, float,
+                      std::vector<std::vector<int32_t>>, std::vector<int32_t>,
+                      std::vector<torch::Tensor>>(),
+             py::arg("seed_nodes"), py::arg("seed_node_type_id"), py::arg("alpha"),
+             py::arg("requeue_threshold_factor"), py::arg("node_type_to_edge_type_ids"),
+             py::arg("edge_type_to_dst_ntype_id"), py::arg("degree_tensors"))
+        .def("drain_queue", &PPRForwardPushState::drain_queue)
+        .def("push_residuals", &PPRForwardPushState::push_residuals, py::arg("fetched_by_etype_id"))
+        .def("extract_top_k", &PPRForwardPushState::extract_top_k, py::arg("max_ppr_nodes"));
+}
diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.pyi b/gigl/distributed/cpp_extensions/ppr_forward_push.pyi
new file mode 100644
index 000000000..265468c3c
--- /dev/null
+++ b/gigl/distributed/cpp_extensions/ppr_forward_push.pyi
@@ -0,0 +1,21 @@
+import torch
+
+class PPRForwardPushState:  # Stub for the pybind11 class bound in ppr_forward_push.cpp.
+    def __init__(
+        self,
+        seed_nodes: torch.Tensor,  # 1D tensor of seed node IDs
+        seed_node_type_id: int,
+        alpha: float,
+        requeue_threshold_factor: float,
+        node_type_to_edge_type_ids: list[list[int]],  # indexed by node type ID
+        edge_type_to_dst_ntype_id: list[int],  # indexed by edge type ID
+        degree_tensors: list[torch.Tensor],  # indexed by node type ID
+    ) -> None: ...
+    def drain_queue(self) -> dict[int, torch.Tensor] | None: ...  # None once the queue is empty
+    def push_residuals(
+        self,
+        fetched_by_etype_id: dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
+    ) -> None: ...  # values: (node_ids, flat_neighbors, counts)
+    def extract_top_k(
+        self, max_ppr_nodes: int
+    ) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: ...  # keyed by ntype_id
diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index 17673a72d..cf4c732c0 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -1,14 +1,6 @@
-# TODO (mkolodner-sc): The forward push loop in _compute_ppr_scores is the
-# main throughput bottleneck — both the queue drain (preparing batched node
-# lookups by edge type) and the residual push/requeue pass are pure Python
-# dict/set operations in tight nested loops.  Moving these to a C++ extension
-# (e.g. pybind11) would eliminate per-operation Python overhead and enable
-# cache-friendly memory access patterns.
-
 # TODO (mkolodner-sc): Investigate whether concurrency for _sample_one_hop and _compute_ppr_scores will
 # yield performance benefits.
 
-import heapq
 from collections import defaultdict
 from typing import Optional, Union
 
@@ -22,6 +14,7 @@
 from graphlearn_torch.typing import EdgeType, NodeType
 from graphlearn_torch.utils import merge_dict
 
+from gigl.distributed.cpp_extensions import PPRForwardPushState
 from gigl.distributed.dist_neighbor_sampler import DistNeighborSampler
 from gigl.types.graph import is_label_edge_type
 
@@ -48,6 +41,49 @@
 )
 
 
+def _group_fetched_by_etype_id(
+    fetched: dict[tuple[int, EdgeType], list[int]],
+    etype_to_etype_id: dict[EdgeType, int],
+) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+    """Group batch-fetched neighbors by integer edge type ID for the C++ push kernel.
+
+    Performs one linear pass over the fetched dict, building flat tensors per
+    edge type.  This avoids per-neighbor Python overhead in push_residuals by
+    batching all lookups for the same edge type together.
+
+    Args:
+        fetched: Output of _batch_fetch_neighbors: ``(node_id, etype)`` →
+            neighbor list.
+        etype_to_etype_id: Mapping from EdgeType to its integer ID.
+
+    Returns:
+        Dict mapping etype_id to ``(node_ids, flat_neighbors, counts)`` as
+        flat int64 tensors.  ``flat_neighbors`` is the concatenation of all
+        neighbor lists for that edge type; ``counts[i]`` gives the neighbor
+        count for ``node_ids[i]``.
+    """
+    node_ids_by_etype: dict[int, list[int]] = {}
+    flat_nbrs_by_etype: dict[int, list[int]] = {}
+    counts_by_etype: dict[int, list[int]] = {}
+    for (node_id, etype), neighbors in fetched.items():
+        eid = etype_to_etype_id[etype]
+        if eid not in node_ids_by_etype:
+            node_ids_by_etype[eid] = []
+            flat_nbrs_by_etype[eid] = []
+            counts_by_etype[eid] = []
+        node_ids_by_etype[eid].append(node_id)
+        flat_nbrs_by_etype[eid].extend(neighbors)
+        counts_by_etype[eid].append(len(neighbors))
+    return {
+        eid: (
+            torch.tensor(node_ids_by_etype[eid], dtype=torch.long),
+            torch.tensor(flat_nbrs_by_etype[eid], dtype=torch.long),
+            torch.tensor(counts_by_etype[eid], dtype=torch.long),
+        )
+        for eid in node_ids_by_etype
+    }
+
+
 # TODO (mkolodner-sc): Consider introducing a BaseGiGLSampler that owns
 # shared utilities like _prepare_sample_loop_inputs, with KHopSampler and
 # PPRSampler as siblings.  Currently DistPPRNeighborSampler inherits from
@@ -157,6 +193,56 @@ def __init__(
             NodeType, torch.Tensor
         ] = self._build_total_degree_tensors(degree_tensors, total_degree_dtype)
 
+        # Build integer ID mappings for the C++ forward-push kernel.  String
+        # NodeType / EdgeType keys are only used at the Python boundary
+        # (translating to/from _batch_fetch_neighbors); all hot-loop state inside
+        # PPRForwardPushState is indexed by int32 IDs.
+        #
+        # We include both source types (have outgoing edges) and destination-only
+        # types (no outgoing edges, but may accumulate PPR score during the walk)
+        # so the kernel can index residual/ppr_score tables for any node it sees.
+        _all_node_types: list[NodeType] = sorted(
+            {nt for nt in self._node_type_to_edge_types}
+            | {
+                self._get_destination_type(et)
+                for etypes in self._node_type_to_edge_types.values()
+                for et in etypes
+            }
+        )
+        # dict.fromkeys preserves insertion order while deduplicating.
+        _all_edge_types: list[EdgeType] = list(
+            dict.fromkeys(
+                et for etypes in self._node_type_to_edge_types.values() for et in etypes
+            )
+        )
+
+        self._node_type_to_id: dict[NodeType, int] = {
+            nt: i for i, nt in enumerate(_all_node_types)
+        }
+        self._ntype_id_to_ntype: list[NodeType] = _all_node_types
+        self._etype_to_etype_id: dict[EdgeType, int] = {
+            et: i for i, et in enumerate(_all_edge_types)
+        }
+        self._etype_id_to_etype: list[EdgeType] = _all_edge_types
+
+        self._node_type_id_to_edge_type_ids: list[list[int]] = [
+            [
+                self._etype_to_etype_id[et]
+                for et in self._node_type_to_edge_types.get(nt, [])
+            ]
+            for nt in _all_node_types
+        ]
+        self._edge_type_id_to_dst_ntype_id: list[int] = [
+            self._node_type_to_id[self._get_destination_type(et)]
+            for et in _all_edge_types
+        ]
+        # Degree tensors indexed by ntype_id.  Destination-only types get an empty
+        # tensor; the C++ kernel returns 0 for those, matching _get_total_degree.
+        self._degree_tensors_for_cpp: list[torch.Tensor] = [
+            self._node_type_to_total_degree.get(nt, torch.zeros(0, dtype=torch.int32))
+            for nt in _all_node_types
+        ]
+
     def _build_total_degree_tensors(
         self,
         degree_tensors: Union[torch.Tensor, dict[EdgeType, torch.Tensor]],
@@ -209,35 +295,6 @@ def _build_total_degree_tensors(
 
         return result
 
-    def _get_total_degree(self, node_id: int, node_type: NodeType) -> int:
-        """Look up the precomputed total degree of a node.
-
-        Args:
-            node_id: The ID of the node to look up.
-            node_type: The node type.
-
-        Returns:
-            The total degree (sum across all edge types) for the node.
-
-        Raises:
-            ValueError: If the node ID is out of range, indicating corrupted
-                graph data or a sampler bug.
-        """
-        # Destination-only node types (no outgoing edges) are absent from
-        # _node_type_to_total_degree because total degree is only computed for
-        # traversable source types.  Returning 0 here is correct: such nodes
-        # act as terminals — they accumulate PPR score but never push residual
-        # further.
-        if node_type not in self._node_type_to_total_degree:
-            return 0
-        degree_tensor = self._node_type_to_total_degree[node_type]
-        if node_id >= len(degree_tensor):
-            raise ValueError(
-                f"Node ID {node_id} exceeds total degree tensor length "
-                f"({len(degree_tensor)}) for node type {node_type}."
-            )
-        return int(degree_tensor[node_id].item())
-
     def _get_destination_type(self, edge_type: EdgeType) -> NodeType:
         """Get the node type at the destination end of an edge type."""
         return edge_type[0] if self.edge_dir == "in" else edge_type[-1]
@@ -369,226 +426,59 @@ async def _compute_ppr_scores(
         if seed_node_type is None:
             seed_node_type = _PPR_HOMOGENEOUS_NODE_TYPE
         device = seed_nodes.device
-        batch_size = seed_nodes.size(0)
-
-        # Per-seed PPR state, nested by node type for efficient type-grouped access.
-
-        # ppr_scores[i][node_type][node_id] = accumulated PPR score for node_id
-        # of type node_type, relative to seed i.  Updated each iteration by
-        # absorbing the node's residual.
-        ppr_scores: list[dict[NodeType, dict[int, float]]] = [
-            defaultdict(lambda: defaultdict(float)) for _ in range(batch_size)
-        ]
 
-        # residuals[i][node_type][node_id] = unconverged probability mass at node_id
-        # of type node_type for seed i.  Each iteration, a node's residual is
-        # absorbed into its PPR score and then distributed to its neighbors.
-        residuals: list[dict[NodeType, dict[int, float]]] = [
-            defaultdict(lambda: defaultdict(float)) for _ in range(batch_size)
-        ]
-
-        # queue[i][node_type] = set of node IDs whose residual exceeds the
-        # convergence threshold (alpha * eps * total_degree).  The algorithm
-        # terminates when all queues are empty.  A set is used because multiple
-        # neighbors can push residual to the same node in one iteration —
-        # deduplication avoids redundant processing, and the O(1) membership
-        # check matters since it runs in the innermost loop.
-        queue: list[dict[NodeType, set[int]]] = [
-            defaultdict(set) for _ in range(batch_size)
-        ]
-
-        seed_list = seed_nodes.tolist()
-
-        for i, seed in enumerate(seed_list):
-            residuals[i][seed_node_type][seed] = self._alpha
-            queue[i][seed_node_type].add(seed)
-
-        # Cache keyed by (node_id, edge_type) since same node can have different neighbors per edge type
-        neighbor_cache: dict[tuple[int, EdgeType], list[int]] = {}
-
-        num_nodes_in_queue = batch_size
-        one_minus_alpha = 1 - self._alpha
-
-        while num_nodes_in_queue > 0:
-            # Drain all nodes from all queues and group by edge type for batched lookups
-            queued_nodes: list[dict[NodeType, set[int]]] = [
-                defaultdict(set) for _ in range(batch_size)
-            ]
-            nodes_to_lookup: dict[EdgeType, set[int]] = defaultdict(set)
-
-            for seed_idx in range(batch_size):
-                if queue[seed_idx]:
-                    queued_nodes[seed_idx] = queue[seed_idx]
-                    queue[seed_idx] = defaultdict(set)
-                    for node_type, node_ids in queued_nodes[seed_idx].items():
-                        num_nodes_in_queue -= len(node_ids)
-                        # We fetch neighbors for ALL edge types originating
-                        # from this node type, not just the edge type that
-                        # caused the node to be queued.  This is required for
-                        # correctness: forward push distributes residual to
-                        # all neighbors proportionally by total degree, so
-                        # every edge type must be considered.
-                        # Destination-only types have no entry in _node_type_to_edge_types;
-                        # .get() returns [] so we skip neighbor lookup for them.
-                        edge_types_for_node = self._node_type_to_edge_types.get(
-                            node_type, []
-                        )
-                        for node_id in node_ids:
-                            for etype in edge_types_for_node:
-                                cache_key = (node_id, etype)
-                                if cache_key not in neighbor_cache:
-                                    # TODO (mkolodner-sc): Investigate switching from set to list
-                                    # here.  _sample_one_hop handles duplicates correctly (second
-                                    # write to result[(node_id, etype)] is a no-op overwrite), so
-                                    # dedup is not required for correctness.  A list would avoid
-                                    # per-add hash cost and the set->list->tensor conversion in
-                                    # _batch_fetch_neighbors, though at the cost of redundant
-                                    # network calls for any duplicate nodes across seeds.
-                                    nodes_to_lookup[etype].add(node_id)
-
-            fetched_neighbors = await self._batch_fetch_neighbors(
-                nodes_to_lookup=nodes_to_lookup,
-                device=device,
-            )
-            # fetched_neighbors is intentionally NOT merged into neighbor_cache
-            # upfront.  We only promote entries when a node is requeued — see
-            # the should_requeue block below.
-
-            # Push residual to neighbors and re-queue in a single pass.  This
-            # is safe because each seed's state is independent, and residuals
-            # are always positive so the merged loop can never miss a re-queue.
-            for seed_idx in range(batch_size):
-                for source_type, source_nodes in queued_nodes[seed_idx].items():
-                    for source_node in source_nodes:
-                        source_residual = residuals[seed_idx][source_type].get(
-                            source_node, 0.0
-                        )
-
-                        ppr_scores[seed_idx][source_type][
-                            source_node
-                        ] += source_residual
-                        residuals[seed_idx][source_type][source_node] = 0.0
-
-                        # Same destination-only guard as in the queue drain loop above.
-                        edge_types_for_node = self._node_type_to_edge_types.get(
-                            source_type, []
-                        )
-
-                        total_degree = self._get_total_degree(source_node, source_type)
-
-                        if total_degree == 0:
-                            continue
+        ppr_state = PPRForwardPushState(
+            seed_nodes,
+            self._node_type_to_id[seed_node_type],
+            self._alpha,
+            self._requeue_threshold_factor,
+            self._node_type_id_to_edge_type_ids,
+            self._edge_type_id_to_dst_ntype_id,
+            self._degree_tensors_for_cpp,
+        )
 
-                        residual_per_neighbor = (
-                            one_minus_alpha * source_residual / total_degree
-                        )
+        while True:
+            # drain_queue returns None when the queue is truly empty (convergence),
+            # or a dict (possibly empty) when nodes were drained.  An empty dict
+            # means all drained nodes either had cached neighbors or no outgoing
+            # edges — we still call push_residuals to flush their residuals into
+            # ppr_scores_.
+            drain_result: dict[int, torch.Tensor] | None = ppr_state.drain_queue()
+            if drain_result is None:
+                break
+
+            nodes_by_etype_id: dict[int, torch.Tensor] = drain_result
+            if nodes_by_etype_id:
+                # Translate integer etype IDs back to EdgeType for the distributed
+                # fetch layer.  O(num_active_etypes) — negligible vs. RPC round-trip.
+                nodes_to_lookup: dict[EdgeType, set[int]] = {
+                    self._etype_id_to_etype[eid]: set(t.tolist())
+                    for eid, t in nodes_by_etype_id.items()
+                }
+                fetched_neighbors = await self._batch_fetch_neighbors(
+                    nodes_to_lookup, device
+                )
+                fetched_by_etype_id = _group_fetched_by_etype_id(
+                    fetched_neighbors, self._etype_to_etype_id
+                )
+            else:
+                fetched_by_etype_id = {}
 
-                        for etype in edge_types_for_node:
-                            cache_key = (source_node, etype)
-                            # fetched_neighbors and neighbor_cache are mutually
-                            # exclusive per iteration: the queue drain only adds
-                            # a node to nodes_to_lookup if it is absent from
-                            # neighbor_cache, so a key appears in at most one.
-                            neighbor_list = fetched_neighbors.get(
-                                cache_key, neighbor_cache.get(cache_key, [])
-                            )
-                            if not neighbor_list:
-                                continue
-
-                            neighbor_type = self._get_destination_type(etype)
-
-                            for neighbor_node in neighbor_list:
-                                residuals[seed_idx][neighbor_type][
-                                    neighbor_node
-                                ] += residual_per_neighbor
-
-                                requeue_threshold = (
-                                    self._requeue_threshold_factor
-                                    * self._get_total_degree(
-                                        neighbor_node, neighbor_type
-                                    )
-                                )
-                                should_requeue = (
-                                    neighbor_node not in queue[seed_idx][neighbor_type]
-                                    and residuals[seed_idx][neighbor_type][
-                                        neighbor_node
-                                    ]
-                                    >= requeue_threshold
-                                )
-                                if should_requeue:
-                                    queue[seed_idx][neighbor_type].add(neighbor_node)
-                                    num_nodes_in_queue += 1
-                                    # Promote this node's neighbor lists to the
-                                    # persistent cache: it will be processed next
-                                    # iteration, so caching now avoids a re-fetch.
-                                    # Nodes that are never requeued (typically
-                                    # high-degree) are never promoted, keeping
-                                    # their large neighbor lists out of the cache.
-                                    for (
-                                        promote_etype
-                                    ) in self._node_type_to_edge_types.get(
-                                        neighbor_type, []
-                                    ):
-                                        promote_key = (neighbor_node, promote_etype)
-                                        if (
-                                            promote_key in fetched_neighbors
-                                            and promote_key not in neighbor_cache
-                                        ):
-                                            neighbor_cache[
-                                                promote_key
-                                            ] = fetched_neighbors[promote_key]
-
-        # Extract top-k nodes by PPR score, grouped by node type.
-        # Results are three flat tensors per node type (no padding):
-        #   - flat_ids:      [id_seed0_0, id_seed0_1, ..., id_seed1_0, ...]
-        #   - flat_weights:  [wt_seed0_0, wt_seed0_1, ..., wt_seed1_0, ...]
-        #   - valid_counts:  [count_seed0, count_seed1, ...]
-        #
-        # valid_counts[i] records how many top-k neighbors seed i contributed.
-        # The inducer uses valid_counts to slice flat_ids into per-seed groups
-        # and assign local indices.  Example:
-        #
-        #   4 seeds, valid_counts = [1, 6, 2, 1]  (10 total pairs)
-        #   flat_ids = [d0a, d1a, d1b, d1c, d1d, d1e, d1f, d2a, d2b, d3a]
-        #
-        #   seed 0 owns flat_ids[0:1],  seed 1 owns flat_ids[1:7],
-        #   seed 2 owns flat_ids[7:9],  seed 3 owns flat_ids[9:10]
-        # _node_type_to_edge_types only contains source types; destination-only
-        # types are absent but may have accumulated PPR scores during the walk.
-        # We union with all types seen in ppr_scores so they appear in the output.
-        all_node_types: set[NodeType] = set(self._node_type_to_edge_types.keys())
-        for seed_ppr in ppr_scores:
-            all_node_types.update(seed_ppr.keys())
+            ppr_state.push_residuals(fetched_by_etype_id)
 
+        # Translate ntype_id integer keys back to NodeType strings for the rest
+        # of the pipeline, and move tensors to the correct device.
         ntype_to_flat_ids: dict[NodeType, torch.Tensor] = {}
         ntype_to_flat_weights: dict[NodeType, torch.Tensor] = {}
         ntype_to_valid_counts: dict[NodeType, torch.Tensor] = {}
 
-        for ntype in all_node_types:
-            flat_ids: list[int] = []
-            flat_weights: list[float] = []
-            valid_counts: list[int] = []
-
-            for i in range(batch_size):
-                type_scores = ppr_scores[i].get(ntype, {})
-                top_k = heapq.nlargest(
-                    self._max_ppr_nodes, type_scores.items(), key=lambda x: x[1]
-                )
-                if top_k:
-                    ids, weights = zip(*top_k)
-                    flat_ids.extend(ids)
-                    flat_weights.extend(weights)
-                valid_counts.append(len(top_k))
-
-            ntype_to_flat_ids[ntype] = torch.tensor(
-                flat_ids, dtype=torch.long, device=device
-            )
-            ntype_to_flat_weights[ntype] = torch.tensor(
-                flat_weights, dtype=torch.float, device=device
-            )
-            ntype_to_valid_counts[ntype] = torch.tensor(
-                valid_counts, dtype=torch.long, device=device
-            )
+        for ntype_id, (flat_ids, flat_weights, valid_counts) in ppr_state.extract_top_k(
+            self._max_ppr_nodes
+        ).items():
+            ntype = self._ntype_id_to_ntype[ntype_id]
+            ntype_to_flat_ids[ntype] = flat_ids.to(device)
+            ntype_to_flat_weights[ntype] = flat_weights.to(device)
+            ntype_to_valid_counts[ntype] = valid_counts.to(device)
 
         if self._is_homogeneous:
             assert (

From a23179686c7025a9e8428267743a036f0b532e63 Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Tue, 24 Mar 2026 21:07:02 +0000
Subject: [PATCH 02/14] small precision fix

---
 .../cpp_extensions/ppr_forward_push.cpp       | 35 ++++++++++---------
 .../unit/distributed/dist_ppr_sampler_test.py | 13 ++++---
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
index 6f6d10545..e50373dcd 100644
--- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
+++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
@@ -33,14 +33,14 @@ class PPRForwardPushState {
     PPRForwardPushState(
         torch::Tensor seed_nodes,
         int32_t seed_node_type_id,
-        float alpha,
-        float requeue_threshold_factor,
+        double alpha,
+        double requeue_threshold_factor,
         std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
         std::vector<int32_t> edge_type_to_dst_ntype_id,
         std::vector<torch::Tensor> degree_tensors
     )
         : alpha_(alpha),
-          one_minus_alpha_(1.0f - alpha),
+          one_minus_alpha_(1.0 - alpha),
           requeue_threshold_factor_(requeue_threshold_factor),
           node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
           edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
@@ -50,8 +50,8 @@ class PPRForwardPushState {
         batch_size_     = static_cast<int32_t>(seed_nodes.size(0));
         num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());
 
-        ppr_scores_.assign(batch_size_,    std::vector<std::unordered_map<int32_t, float>>(num_node_types_));
-        residuals_.assign(batch_size_,     std::vector<std::unordered_map<int32_t, float>>(num_node_types_));
+        ppr_scores_.assign(batch_size_,    std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
+        residuals_.assign(batch_size_,     std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
         queue_.assign(batch_size_,         std::vector<std::unordered_set<int32_t>>(num_node_types_));
         queued_nodes_.assign(batch_size_,  std::vector<std::unordered_set<int32_t>>(num_node_types_));
 
@@ -148,15 +148,15 @@ class PPRForwardPushState {
                 for (int32_t src : queued_nodes_[s][nt]) {
                     auto& src_res = residuals_[s][nt];
                     auto it = src_res.find(src);
-                    float res = (it != src_res.end()) ? it->second : 0.0f;
+                    double res = (it != src_res.end()) ? it->second : 0.0;
 
                     ppr_scores_[s][nt][src] += res;
-                    src_res[src] = 0.0f;
+                    src_res[src] = 0.0;
 
                     int32_t total_deg = get_total_degree(src, nt);
                     if (total_deg == 0) continue;
 
-                    float res_per_nbr = one_minus_alpha_ * res / static_cast<float>(total_deg);
+                    double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_deg);
 
                     for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
                         // fetched and neighbor_cache are mutually exclusive per iteration:
@@ -177,8 +177,8 @@ class PPRForwardPushState {
                         for (int32_t nbr : *nbr_list) {
                             residuals_[s][dst_nt][nbr] += res_per_nbr;
 
-                            float threshold = requeue_threshold_factor_ *
-                                static_cast<float>(get_total_degree(nbr, dst_nt));
+                            double threshold = requeue_threshold_factor_ *
+                                static_cast<double>(get_total_degree(nbr, dst_nt));
 
                             if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() &&
                                 residuals_[s][dst_nt][nbr] >= threshold) {
@@ -225,12 +225,12 @@ class PPRForwardPushState {
                 const auto& scores = ppr_scores_[s][nt];
                 int32_t k = std::min(max_ppr_nodes, static_cast<int32_t>(scores.size()));
                 if (k > 0) {
-                    std::vector<std::pair<int32_t, float>> items(scores.begin(), scores.end());
+                    std::vector<std::pair<int32_t, double>> items(scores.begin(), scores.end());
                     std::partial_sort(items.begin(), items.begin() + k, items.end(),
                         [](const auto& a, const auto& b) { return a.second > b.second; });
                     for (int32_t i = 0; i < k; ++i) {
                         flat_ids.push_back(static_cast<int64_t>(items[i].first));
-                        flat_weights.push_back(items[i].second);
+                        flat_weights.push_back(static_cast<float>(items[i].second));
                     }
                 }
                 valid_counts.push_back(static_cast<int64_t>(k));
@@ -258,7 +258,7 @@ class PPRForwardPushState {
         return t.data_ptr<int32_t>()[node_id];
     }
 
-    float   alpha_, one_minus_alpha_, requeue_threshold_factor_;
+    double  alpha_, one_minus_alpha_, requeue_threshold_factor_;
     int32_t batch_size_, num_node_types_, num_nodes_in_queue_{0};
 
     std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;
@@ -266,8 +266,11 @@ class PPRForwardPushState {
     std::vector<torch::Tensor>        degree_tensors_;
 
     // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]).
-    std::vector<std::vector<std::unordered_map<int32_t, float>>> ppr_scores_;
-    std::vector<std::vector<std::unordered_map<int32_t, float>>> residuals_;
+    // double precision avoids float32 rounding errors accumulating over 20-30
+    // push iterations, which would otherwise cause ~1e-4 score errors vs the
+    // true PPR.  Output weights are cast to float32 in extract_top_k.
+    std::vector<std::vector<std::unordered_map<int32_t, double>>> ppr_scores_;
+    std::vector<std::vector<std::unordered_map<int32_t, double>>> residuals_;
     std::vector<std::vector<std::unordered_set<int32_t>>>         queue_;
     // Snapshot of queue contents from the last drain_queue() call, used by push_residuals().
     std::vector<std::vector<std::unordered_set<int32_t>>>         queued_nodes_;
@@ -283,7 +286,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         .def(py::init<
             torch::Tensor,
             int32_t,
-            float, float,
+            double, double,
             std::vector<std::vector<int32_t>>,
             std::vector<int32_t>,
             std::vector<torch::Tensor>
diff --git a/tests/unit/distributed/dist_ppr_sampler_test.py b/tests/unit/distributed/dist_ppr_sampler_test.py
index 5ad96e66e..a4f6d160b 100644
--- a/tests/unit/distributed/dist_ppr_sampler_test.py
+++ b/tests/unit/distributed/dist_ppr_sampler_test.py
@@ -270,8 +270,9 @@ def _assert_ppr_scores_match_reference(
     """Assert sampler PPR scores match reference scores per node type.
 
     Checks that top-k node sets are identical and that per-node scores
-    are within atol=1e-6.  The forward push error per node is bounded by
-    O(alpha * eps * degree); observed deltas are ~1e-7 for eps=1e-6.
+    are within atol=1e-5.  The forward push error per node is bounded by
+    O(alpha * eps * degree); for max degree 3, alpha=0.5, eps=1e-6 the
+    theoretical bound is ~1.5e-6, so 1e-5 provides a safety margin.
 
     Args:
         ntype_to_sampler_ppr: Sampler output from :func:`_extract_hetero_ppr_scores`.
@@ -290,7 +291,7 @@ def _assert_ppr_scores_match_reference(
         for node_id in reference_ppr[ntype_str]:
             ref_score = reference_ppr[ntype_str][node_id]
             sam_score = ntype_to_sampler_ppr[ntype_str][node_id]
-            assert abs(sam_score - ref_score) < 1e-6, (
+            assert abs(sam_score - ref_score) < 1e-5, (
                 f"{seed_id}, type {ntype_str}, node {node_id}: "
                 f"sampler={sam_score:.8f} vs reference={ref_score:.8f}"
             )
@@ -372,11 +373,13 @@ def _run_ppr_loader_correctness_check(
         )
 
         # Forward push is an approximation; with eps=1e-6 the per-node error
-        # is bounded by O(alpha * eps * degree).  Observed deltas are ~1e-7.
+        # is bounded by O(alpha * eps * degree).  For this test graph
+        # (max degree 3, alpha=0.5, eps=1e-6) the theoretical bound is ~1.5e-6.
+        # Tolerance is set to 1e-5 to provide a safety margin above that bound.
         for node_id in reference_ppr:
             ref_score = reference_ppr[node_id]
             sam_score = sampler_ppr[node_id]
-            assert abs(sam_score - ref_score) < 1e-6, (
+            assert abs(sam_score - ref_score) < 1e-5, (
                 f"Seed {seed_global_id}, node {node_id}: "
                 f"sampler={sam_score:.8f} vs reference={ref_score:.8f}"
             )

From a19db887809278de0de5427be033798031ddb41b Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Tue, 24 Mar 2026 21:36:32 +0000
Subject: [PATCH 03/14] Optimize

---
 gigl/distributed/dist_ppr_sampler.py | 118 ++++++---------------------
 1 file changed, 24 insertions(+), 94 deletions(-)

diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index cf4c732c0..b63329357 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -41,49 +41,6 @@
 )
 
 
-def _group_fetched_by_etype_id(
-    fetched: dict[tuple[int, EdgeType], list[int]],
-    etype_to_etype_id: dict[EdgeType, int],
-) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
-    """Group batch-fetched neighbors by integer edge type ID for the C++ push kernel.
-
-    Performs one linear pass over the fetched dict, building flat tensors per
-    edge type.  This avoids per-neighbor Python overhead in push_residuals by
-    batching all lookups for the same edge type together.
-
-    Args:
-        fetched: Output of _batch_fetch_neighbors: ``(node_id, etype)`` →
-            neighbor list.
-        etype_to_etype_id: Mapping from EdgeType to its integer ID.
-
-    Returns:
-        Dict mapping etype_id to ``(node_ids, flat_neighbors, counts)`` as
-        flat int64 tensors.  ``flat_neighbors`` is the concatenation of all
-        neighbor lists for that edge type; ``counts[i]`` gives the neighbor
-        count for ``node_ids[i]``.
-    """
-    node_ids_by_etype: dict[int, list[int]] = {}
-    flat_nbrs_by_etype: dict[int, list[int]] = {}
-    counts_by_etype: dict[int, list[int]] = {}
-    for (node_id, etype), neighbors in fetched.items():
-        eid = etype_to_etype_id[etype]
-        if eid not in node_ids_by_etype:
-            node_ids_by_etype[eid] = []
-            flat_nbrs_by_etype[eid] = []
-            counts_by_etype[eid] = []
-        node_ids_by_etype[eid].append(node_id)
-        flat_nbrs_by_etype[eid].extend(neighbors)
-        counts_by_etype[eid].append(len(neighbors))
-    return {
-        eid: (
-            torch.tensor(node_ids_by_etype[eid], dtype=torch.long),
-            torch.tensor(flat_nbrs_by_etype[eid], dtype=torch.long),
-            torch.tensor(counts_by_etype[eid], dtype=torch.long),
-        )
-        for eid in node_ids_by_etype
-    }
-
-
 # TODO (mkolodner-sc): Consider introducing a BaseGiGLSampler that owns
 # shared utilities like _prepare_sample_loop_inputs, with KHopSampler and
 # PPRSampler as siblings.  Currently DistPPRNeighborSampler inherits from
@@ -195,7 +152,7 @@ def __init__(
 
         # Build integer ID mappings for the C++ forward-push kernel.  String
         # NodeType / EdgeType keys are only used at the Python boundary
-        # (translating to/from _batch_fetch_neighbors); all hot-loop state inside
+        # (translating to/from _sample_one_hop); all hot-loop state inside
         # PPRForwardPushState is indexed by int32 IDs.
         #
         # We include both source types (have outgoing edges) and destination-only
@@ -301,68 +258,50 @@ def _get_destination_type(self, edge_type: EdgeType) -> NodeType:
 
     async def _batch_fetch_neighbors(
         self,
-        nodes_to_lookup: dict[EdgeType, set[int]],
+        nodes_by_etype_id: dict[int, torch.Tensor],
         device: torch.device,
-    ) -> dict[tuple[int, EdgeType], list[int]]:
-        """Batch fetch neighbors for nodes grouped by edge type.
+    ) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        """Batch fetch neighbors for nodes grouped by integer edge type ID.
 
         Issues one ``_sample_one_hop`` call per edge type (not per node), so all
         nodes of the same edge type are fetched in a single RPC round-trip. Each
         node's neighbor list is capped at ``self._num_neighbors_per_hop``.
 
         Args:
-            nodes_to_lookup: Dict mapping each edge type to the set of node IDs
-                whose neighbors should be fetched via that edge type.  Only nodes
-                absent from the caller's ``neighbor_cache`` should be included.
+            nodes_by_etype_id: Dict mapping integer edge type ID to a 1-D int64
+                tensor of node IDs to fetch neighbors for.  Comes directly from
+                ``drain_queue()``; node IDs are already deduplicated.
             device: Torch device for intermediate tensor creation.
 
         Returns:
-            Dict mapping ``(node_id, edge_type)`` to the list of neighbor node IDs
-            returned by ``_sample_one_hop``.  Only nodes that appeared in
-            ``nodes_to_lookup`` are present; edge types with an empty node set are
-            skipped entirely.
+            Dict mapping etype_id to ``(node_ids, flat_neighbors, counts)`` as
+            int64 tensors, ready to pass directly to ``push_residuals``.
+            ``flat_neighbors`` is the flat concatenation of all neighbor lists
+            for that edge type; ``counts[i]`` is the neighbor count for
+            ``node_ids[i]``.
 
         Example::
 
-            nodes_to_lookup = {
-                ("user", "buys", "item"): {0, 3},
-                ("item", "bought_by", "user"): {7},
+            nodes_by_etype_id = {
+                2: tensor([0, 3]),   # etype_id 2 → nodes 0 and 3
+                5: tensor([7]),      # etype_id 5 → node 7
             }
             # Might return (neighbor lists depend on graph structure):
             {
-                (0, ("user", "buys", "item")): [5, 9, 2],
-                (3, ("user", "buys", "item")): [1],
-                (7, ("item", "bought_by", "user")): [0, 3],
+                2: (tensor([0, 3]), tensor([5, 9, 2, 1]), tensor([3, 1])),
+                5: (tensor([7]),    tensor([0, 3]),        tensor([2])),
             }
         """
-        result: dict[tuple[int, EdgeType], list[int]] = {}
-        for etype, node_ids in nodes_to_lookup.items():
-            if not node_ids:
-                continue
-            nodes_list = list(node_ids)
-            lookup_tensor = torch.tensor(nodes_list, dtype=torch.long, device=device)
-
+        result: dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = {}
+        for eid, node_ids_tensor in nodes_by_etype_id.items():
+            etype = self._etype_id_to_etype[eid]
             # _sample_one_hop expects None for homogeneous graphs, not the PPR sentinel.
             output: NeighborOutput = await self._sample_one_hop(
-                srcs=lookup_tensor,
+                srcs=node_ids_tensor.to(device),
                 num_nbr=self._num_neighbors_per_hop,
                 etype=etype if etype != _PPR_HOMOGENEOUS_EDGE_TYPE else None,
             )
-            neighbors = output.nbr
-            neighbor_counts = output.nbr_num
-
-            # TODO (mkolodner-sc): Investigate performance of a vectorized version of the below code
-            neighbors_list = neighbors.tolist()
-            counts_list = neighbor_counts.tolist()
-            del neighbors, neighbor_counts
-
-            # neighbors_list is a flat concatenation of all neighbors for all looked-up nodes.
-            # We use offset to slice out each node's neighbors: node i's neighbors are at
-            # neighbors_list[offset : offset + count], then we advance offset by count.
-            offset = 0
-            for node_id, count in zip(nodes_list, counts_list):
-                result[(node_id, etype)] = neighbors_list[offset : offset + count]
-                offset += count
+            result[eid] = (node_ids_tensor, output.nbr, output.nbr_num)
 
         return result
 
@@ -449,17 +388,8 @@ async def _compute_ppr_scores(
 
             nodes_by_etype_id: dict[int, torch.Tensor] = drain_result
             if nodes_by_etype_id:
-                # Translate integer etype IDs back to EdgeType for the distributed
-                # fetch layer.  O(num_active_etypes) — negligible vs. RPC round-trip.
-                nodes_to_lookup: dict[EdgeType, set[int]] = {
-                    self._etype_id_to_etype[eid]: set(t.tolist())
-                    for eid, t in nodes_by_etype_id.items()
-                }
-                fetched_neighbors = await self._batch_fetch_neighbors(
-                    nodes_to_lookup, device
-                )
-                fetched_by_etype_id = _group_fetched_by_etype_id(
-                    fetched_neighbors, self._etype_to_etype_id
+                fetched_by_etype_id = await self._batch_fetch_neighbors(
+                    nodes_by_etype_id, device
                 )
             else:
                 fetched_by_etype_id = {}

From fed381545fc7923d7377b69513480e615057a588 Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Tue, 24 Mar 2026 21:42:45 +0000
Subject: [PATCH 04/14] Add explanatory comments to ppr_forward_push.cpp for
 C++ newcomers

---
 .../cpp_extensions/ppr_forward_push.cpp       | 244 +++++++++++++++---
 1 file changed, 203 insertions(+), 41 deletions(-)

diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
index e50373dcd..e22ac264f 100644
--- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
+++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
@@ -1,16 +1,30 @@
-#include <torch/extension.h>
-#include <pybind11/stl.h>
+#include <torch/extension.h>  // PyTorch C++ API (tensors, TORCH_CHECK)
+#include <pybind11/stl.h>      // Automatic conversion between C++ containers and Python types
 
-#include <algorithm>
-#include <cstdint>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
+#include <algorithm>    // std::partial_sort, std::min
+#include <cstdint>      // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t
+#include <unordered_map>  // std::unordered_map — like Python dict, O(1) average lookup
+#include <unordered_set>  // std::unordered_set — like Python set, O(1) average lookup
+#include <vector>         // std::vector — like Python list, contiguous in memory
 
-namespace py = pybind11;
+namespace py = pybind11;  // Alias for the pybind11 namespace (bridges C++ ↔ Python)
 
-// Pack (node_id, etype_id) into a single uint64_t lookup key.
-// Requires both values fit in 32 bits — enforced by the Python caller.
+// Combine (node_id, etype_id) into a single 64-bit integer for use as a hash
+// map key.  A single 64-bit integer is cheaper to hash than a pair of two
+// integers (std::unordered_map has no built-in pair hash).
+//
+// Bit layout:
+//   bits 63–32: node_id  (upper half)
+//   bits 31– 0: etype_id (lower half)
+//
+// Both inputs are cast through uint32_t before packing.  Without this, a
+// negative etype_id (e.g. -1 = 0xFFFFFFFF) would be sign-extended to 64 bits
+// and, once OR-ed in, overwrite the node_id stored in the upper 32 bits.
+// Casting to uint32_t first keeps the 32-bit pattern as-is (no sign extension).
+//
+// `static` gives the function internal linkage: it is visible only inside
+// this translation unit (.cpp file).  `inline` is merely a hint — the
+// compiler decides on its own whether to inline each call site.
 static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
     return (static_cast<uint64_t>(static_cast<uint32_t>(node_id)) << 32) |
            static_cast<uint32_t>(etype_id);
@@ -18,6 +32,10 @@ static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
 
 // C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006).
 //
+// All hot-loop state (scores, residuals, queue, neighbor cache) lives inside
+// this object.  The distributed neighbor fetch is kept in Python because it
+// involves async RPC calls that C++ cannot drive directly.
+//
 // Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache.
 // Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors).
 //
@@ -42,6 +60,9 @@ class PPRForwardPushState {
         : alpha_(alpha),
           one_minus_alpha_(1.0 - alpha),
           requeue_threshold_factor_(requeue_threshold_factor),
+          // std::move transfers ownership of each vector into the member variable
+          // without copying its contents — equivalent to Python's list hand-off
+          // when you no longer need the original.
           node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
           edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
           degree_tensors_(std::move(degree_tensors)) {
@@ -50,15 +71,26 @@ class PPRForwardPushState {
         batch_size_     = static_cast<int32_t>(seed_nodes.size(0));
         num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());
 
-        ppr_scores_.assign(batch_size_,    std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-        residuals_.assign(batch_size_,     std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-        queue_.assign(batch_size_,         std::vector<std::unordered_set<int32_t>>(num_node_types_));
-        queued_nodes_.assign(batch_size_,  std::vector<std::unordered_set<int32_t>>(num_node_types_));
-
+        // Allocate per-seed, per-node-type tables.
+        // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python.
+        // Each inner element is an empty hash map / hash set for that (seed, ntype) pair.
+        ppr_scores_.assign(batch_size_,   std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
+        residuals_.assign(batch_size_,    std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
+        queue_.assign(batch_size_,        std::vector<std::unordered_set<int32_t>>(num_node_types_));
+        queued_nodes_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
+
+        // accessor<dtype, ndim>() returns a typed view into the tensor's data that
+        // supports [i] indexing; dtype and rank are validated once when it is
+        // created.  Here we read each seed node ID from the 1-D int64 tensor.
         auto acc = seed_nodes.accessor<int64_t, 1>();
         num_nodes_in_queue_ = batch_size_;
         for (int32_t i = 0; i < batch_size_; ++i) {
+            // static_cast<int32_t>: explicit narrowing from int64 to int32.
+            // The Python caller guarantees node IDs fit in 32 bits.
             int32_t seed = static_cast<int32_t>(acc[i]);
+            // PPR initialisation: each seed starts with residual = alpha (the
+            // restart probability).  The first push will move alpha into ppr_score
+            // and distribute (1-alpha)*alpha to the seed's neighbors.
             residuals_[i][seed_node_type_id][seed] = alpha_;
             queue_[i][seed_node_type_id].insert(seed);
         }
@@ -68,32 +100,45 @@ class PPRForwardPushState {
     // neighbor lookup.  Also snapshots the drained nodes into queued_nodes_ for
     // use by push_residuals().
     //
-    // Returns None when the queue is truly empty (convergence signal).
-    // Returns a dict (possibly empty) when nodes were drained but all had cached
-    // neighbors or no outgoing edges — push_residuals must still be called to
-    // flush their residuals into ppr_scores_.
+    // Return value semantics (py::object can hold any Python value):
+    //   - py::none()  → queue was already empty; convergence achieved; stop the loop.
+    //   - py::dict{}  → nodes were drained.  The dict maps etype_id → 1-D int64
+    //                   tensor of node IDs that need neighbor lookups this round.
+    //                   May be empty if all drained nodes were already in the cache
+    //                   or had no outgoing edges — push_residuals must still be called
+    //                   to flush their accumulated residual into ppr_scores_.
     py::object drain_queue() {
         if (num_nodes_in_queue_ == 0) {
             return py::none();
         }
 
+        // Reset the snapshot from the previous iteration.  `auto&` is a reference
+        // (alias) to the existing set — clearing it modifies the original in-place
+        // rather than operating on a copy.
         for (int32_t s = 0; s < batch_size_; ++s)
             for (auto& qs : queued_nodes_[s]) qs.clear();
 
+        // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for
+        // edge type eid this round.  Using a set deduplicates nodes that appear
+        // in multiple seeds' queues: we only fetch each (node, etype) pair once
+        // regardless of how many seeds need it.
         std::unordered_map<int32_t, std::unordered_set<int32_t>> nodes_to_lookup;
 
         for (int32_t s = 0; s < batch_size_; ++s) {
             for (int32_t nt = 0; nt < num_node_types_; ++nt) {
                 if (queue_[s][nt].empty()) continue;
 
-                // Snapshot queue into queued_nodes, then reset queue.
+                // Move the live queue into the snapshot (no data copy — O(1)).
+                // queue_ is then reset to an empty set so new entries added by
+                // push_residuals() in this same iteration don't interfere.
                 queued_nodes_[s][nt] = std::move(queue_[s][nt]);
                 queue_[s][nt].clear();
                 num_nodes_in_queue_ -= static_cast<int32_t>(queued_nodes_[s][nt].size());
 
                 for (int32_t node_id : queued_nodes_[s][nt]) {
                     for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
-                        // Only add to lookup if not already in the persistent cache.
+                        // Only request a fetch if the neighbor list isn't already
+                        // cached from a previous iteration.
                         if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) {
                             nodes_to_lookup[eid].insert(node_id);
                         }
@@ -102,8 +147,13 @@ class PPRForwardPushState {
             }
         }
 
+        // Convert to Python: {etype_id (int) → 1-D int64 tensor of node IDs}.
+        // py::int_(eid) wraps a C++ int as a Python int so it can be used as a
+        // dict key on the Python side.
         py::dict result;
         for (auto& [eid, node_set] : nodes_to_lookup) {
+            // Copy the set into a vector first: torch::tensor() requires a
+            // contiguous sequence, not an unordered_set iterator.
             std::vector<int64_t> ids(node_set.begin(), node_set.end());
             result[py::int_(eid)] = torch::tensor(ids, torch::kLong);
         }
@@ -111,24 +161,38 @@ class PPRForwardPushState {
     }
 
     // Push residuals to neighbors given the fetched neighbor data.
+    //
     // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)}
-    //   - node_ids_tensor:  [N] int64 — source node IDs fetched for this edge type
-    //   - flat_nbrs_tensor: [sum(counts)] int64 — flat concatenation of all neighbor lists
-    //   - counts_tensor:    [N] int64 — number of neighbors for each source node
+    //   - node_ids_tensor:  [N]           int64 — source node IDs fetched for this edge type
+    //   - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat
+    //   - counts_tensor:    [N]           int64 — neighbor count for each source node
+    //
+    // For example, if nodes 3 and 7 were fetched for etype 0:
+    //   node_ids  = [3, 7]
+    //   flat_nbrs = [10, 11, 12, 20]   ← node 3 has nbrs {10,11,12}, node 7 has nbr {20}
+    //   counts    = [3, 1]
     void push_residuals(py::dict fetched_by_etype_id) {
-        // Build local fetched map: pack_key(node_id, etype_id) -> neighbor list.
+        // Step 1: Unpack the Python dict into a C++ map for fast lookup during
+        // the residual-push loop below.
+        // fetched: pack_key(node_id, etype_id) → neighbor list (as int32_t vector)
         std::unordered_map<uint64_t, std::vector<int32_t>> fetched;
         for (auto item : fetched_by_etype_id) {
-            int32_t eid      = item.first.cast<int32_t>();
+            int32_t eid = item.first.cast<int32_t>();
+            // .cast<py::tuple>() interprets the Python value as a tuple so we
+            // can index into it with [0], [1], [2].
             auto tup         = item.second.cast<py::tuple>();
             auto node_ids_t  = tup[0].cast<torch::Tensor>();
             auto flat_nbrs_t = tup[1].cast<torch::Tensor>();
             auto counts_t    = tup[2].cast<torch::Tensor>();
 
-            auto node_acc  = node_ids_t.accessor<int64_t, 1>();
-            auto nbr_acc   = flat_nbrs_t.accessor<int64_t, 1>();
-            auto cnt_acc   = counts_t.accessor<int64_t, 1>();
+            // accessor<int64_t, 1>() gives a typed 1-D view into each tensor's data (like a
+            // NumPy array); dtype and rank are checked here, but [i] is not bounds-checked.
+            auto node_acc = node_ids_t.accessor<int64_t, 1>();
+            auto nbr_acc  = flat_nbrs_t.accessor<int64_t, 1>();
+            auto cnt_acc  = counts_t.accessor<int64_t, 1>();
 
+            // Walk the flat neighbor list, slicing out each node's neighbors using
+            // the running offset into the concatenated flat buffer.
             int64_t offset = 0;
             for (int64_t i = 0; i < node_ids_t.size(0); ++i) {
                 int32_t nid   = static_cast<int32_t>(node_acc[i]);
@@ -136,54 +200,85 @@ class PPRForwardPushState {
                 std::vector<int32_t> nbrs(count);
                 for (int64_t j = 0; j < count; ++j)
                     nbrs[j] = static_cast<int32_t>(nbr_acc[offset + j]);
+                // std::move: hand off nbrs to the map without copying its contents.
                 fetched[pack_key(nid, eid)] = std::move(nbrs);
                 offset += count;
             }
         }
 
+        // Step 2: For every node that was in the queue (captured in queued_nodes_
+        // by drain_queue()), apply one PPR push step:
+        //   a. Absorb residual into the PPR score.
+        //   b. Distribute (1-alpha) * residual equally to each neighbor.
+        //   c. Enqueue any neighbor whose residual now exceeds the requeue threshold.
         for (int32_t s = 0; s < batch_size_; ++s) {
             for (int32_t nt = 0; nt < num_node_types_; ++nt) {
                 if (queued_nodes_[s][nt].empty()) continue;
 
                 for (int32_t src : queued_nodes_[s][nt]) {
+                    // `auto&` gives a reference to the residual map for this
+                    // (seed, node_type) pair so we can read and write it without
+                    // an extra hash lookup each time.
                     auto& src_res = residuals_[s][nt];
+                    // .find() returns an iterator; .end() means "not found".
+                    // We treat a missing entry as residual = 0.
                     auto it = src_res.find(src);
                     double res = (it != src_res.end()) ? it->second : 0.0;
 
+                    // a. Absorb: move residual into the PPR score.
                     ppr_scores_[s][nt][src] += res;
                     src_res[src] = 0.0;
 
                     int32_t total_deg = get_total_degree(src, nt);
+                    // Destination-only nodes (no outgoing edges) absorb residual
+                    // into their PPR score but do not push further.
                     if (total_deg == 0) continue;
 
+                    // b. Distribute: each neighbor of src (across all edge types
+                    // from nt) receives an equal share of the pushed residual.
                     double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_deg);
 
                     for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
-                        // fetched and neighbor_cache are mutually exclusive per iteration:
-                        // drain_queue only adds a node to nodes_to_lookup when absent from
-                        // neighbor_cache, so a given key appears in at most one of the two.
+                        // Invariant: fetched and neighbor_cache_ are mutually exclusive for
+                        // any given (node, etype) key within one iteration.  drain_queue()
+                        // only requests a fetch for nodes absent from neighbor_cache_, so a
+                        // key is in at most one of the two.  We check fetched first since it
+                        // is the common case for newly-seen nodes.
+                        //
+                        // `const std::vector<int32_t>*` is a pointer to a neighbor list.
+                        // We use a pointer (rather than copying the list) so we can check
+                        // for absence with nullptr without allocating anything.
                         const std::vector<int32_t>* nbr_list = nullptr;
                         auto fi = fetched.find(pack_key(src, eid));
                         if (fi != fetched.end()) {
+                            // `&fi->second` takes the address of the vector stored in
+                            // the map — nbr_list now points to it without copying.
                             nbr_list = &fi->second;
                         } else {
                             auto ci = neighbor_cache_.find(pack_key(src, eid));
                             if (ci != neighbor_cache_.end()) nbr_list = &ci->second;
                         }
+                        // Skip if no neighbor list is available (node has no edges of
+                        // this type, or the fetch returned an empty list).
                         if (!nbr_list || nbr_list->empty()) continue;
 
                         int32_t dst_nt = edge_type_to_dst_ntype_id_[eid];
 
+                        // c. For each neighbor, accumulate residual and check threshold.
+                        // `*nbr_list` dereferences the pointer to access the vector.
                         for (int32_t nbr : *nbr_list) {
                             residuals_[s][dst_nt][nbr] += res_per_nbr;
 
                             double threshold = requeue_threshold_factor_ *
                                 static_cast<double>(get_total_degree(nbr, dst_nt));
 
+                            // Only enqueue if: (1) not already in queue for this
+                            // iteration, and (2) residual exceeds the push threshold
+                            // alpha * eps * degree.
                             if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() &&
                                 residuals_[s][dst_nt][nbr] >= threshold) {
                                 queue_[s][dst_nt].insert(nbr);
-                                ++num_nodes_in_queue_;
+                                ++num_nodes_in_queue_;  // ++x is equivalent to x += 1
 
                                 // Promote this node's neighbor lists to the persistent cache:
                                 // it will be processed next iteration, so caching now avoids
@@ -207,9 +302,16 @@ class PPRForwardPushState {
     }
 
     // Extract top-k PPR nodes per seed per node type.
+    //
     // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}.
     // Only node types that received any PPR score are included in the output.
+    //
+    // Output layout for a batch of B seeds (same structure as _batch_fetch_neighbors):
+    //   flat_ids[0 : valid_counts[0]]                 → top-k nodes for seed 0
+    //   flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1
+    //   ...
     py::dict extract_top_k(int32_t max_ppr_nodes) {
+        // Collect node types that have any PPR score — skip types with no activity.
         std::unordered_set<int32_t> active;
         for (int32_t s = 0; s < batch_size_; ++s)
             for (int32_t nt = 0; nt < num_node_types_; ++nt)
@@ -217,25 +319,42 @@ class PPRForwardPushState {
 
         py::dict result;
         for (int32_t nt : active) {
+            // Flat output vectors — entries for all seeds are concatenated.
             std::vector<int64_t> flat_ids;
             std::vector<float>   flat_weights;
             std::vector<int64_t> valid_counts;
 
             for (int32_t s = 0; s < batch_size_; ++s) {
+                // `const auto&` is a read-only reference — we iterate the map
+                // without copying it.
                 const auto& scores = ppr_scores_[s][nt];
+                // Cap k at the number of nodes that actually have a score.
                 int32_t k = std::min(max_ppr_nodes, static_cast<int32_t>(scores.size()));
                 if (k > 0) {
+                    // Copy the map entries into a vector of (node_id, score) pairs
+                    // so they can be sorted.  std::pair is like a Python 2-tuple.
                     std::vector<std::pair<int32_t, double>> items(scores.begin(), scores.end());
+
+                    // std::partial_sort rearranges items so that the first k entries
+                    // are the k largest — like Python's heapq.nlargest but in-place.
+                    // The lambda `[](const auto& a, const auto& b) { return ...; }`
+                    // is an anonymous comparator (like Python's `key=` argument).
+                    // `.second` accesses the score (second element of the pair);
+                    // `>` makes it descending (highest score first).
                     std::partial_sort(items.begin(), items.begin() + k, items.end(),
                         [](const auto& a, const auto& b) { return a.second > b.second; });
+
                     for (int32_t i = 0; i < k; ++i) {
                         flat_ids.push_back(static_cast<int64_t>(items[i].first));
+                        // Cast to float32 for output; internal scores stay double to
+                        // avoid accumulated rounding errors in the push loop above.
                         flat_weights.push_back(static_cast<float>(items[i].second));
                     }
                 }
                 valid_counts.push_back(static_cast<int64_t>(k));
             }
 
+            // py::make_tuple wraps C++ values into a Python tuple.
             result[py::int_(nt)] = py::make_tuple(
                 torch::tensor(flat_ids, torch::kLong),
                 torch::tensor(flat_weights, torch::kFloat),
@@ -246,6 +365,8 @@ class PPRForwardPushState {
     }
 
 private:
+    // Look up the total (across all edge types) out-degree of a node.
+    // Returns 0 for destination-only node types (no outgoing edges).
     int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const {
         if (ntype_id >= static_cast<int32_t>(degree_tensors_.size())) return 0;
         const auto& t = degree_tensors_[ntype_id];
@@ -255,34 +376,75 @@ class PPRForwardPushState {
             "Node ID ", node_id, " out of range for degree tensor of ntype_id ", ntype_id,
             " (size=", t.size(0), "). This indicates corrupted graph data or a sampler bug."
         );
+        // data_ptr<int32_t>() returns a raw C pointer to the tensor's int32 data
+        // buffer.  Direct pointer indexing ([node_id]) is safe here because we
+        // validated the bounds with TORCH_CHECK above.
         return t.data_ptr<int32_t>()[node_id];
     }
 
-    double  alpha_, one_minus_alpha_, requeue_threshold_factor_;
-    int32_t batch_size_, num_node_types_, num_nodes_in_queue_{0};
-
+    // -------------------------------------------------------------------------
+    // Scalar algorithm parameters
+    // -------------------------------------------------------------------------
+    double  alpha_;                       // Restart probability
+    double  one_minus_alpha_;             // 1 - alpha, precomputed to avoid repeated subtraction
+    double  requeue_threshold_factor_;    // alpha * eps; multiplied by degree to get per-node threshold
+
+    int32_t batch_size_;                  // Number of seeds in the current batch
+    int32_t num_node_types_;              // Total number of node types (homo + hetero)
+    int32_t num_nodes_in_queue_{0};       // Running count of nodes across all seeds / types
+
+    // -------------------------------------------------------------------------
+    // Graph structure (read-only after construction)
+    // -------------------------------------------------------------------------
+    // node_type_to_edge_type_ids_[ntype_id] → list of edge type IDs that can be
+    // traversed from that node type (outgoing or incoming, depending on edge_dir).
     std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;
+    // edge_type_to_dst_ntype_id_[eid] → node type ID at the destination end.
     std::vector<int32_t>              edge_type_to_dst_ntype_id_;
+    // degree_tensors_[ntype_id][node_id] → total degree of that node across all
+    // edge types traversable from its type.  Empty tensor means no outgoing edges.
     std::vector<torch::Tensor>        degree_tensors_;
 
-    // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]).
+    // -------------------------------------------------------------------------
+    // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id])
+    // -------------------------------------------------------------------------
     // double precision avoids float32 rounding errors accumulating over 20-30
     // push iterations, which would otherwise cause ~1e-4 score errors vs the
     // true PPR.  Output weights are cast to float32 in extract_top_k.
+    //
+    // ppr_scores_[s][nt]: node_id → absorbed PPR score (Σ of residuals pushed so far)
     std::vector<std::vector<std::unordered_map<int32_t, double>>> ppr_scores_;
+    // residuals_[s][nt]: node_id → unabsorbed probability mass waiting to be pushed
     std::vector<std::vector<std::unordered_map<int32_t, double>>> residuals_;
+    // queue_[s][nt]: nodes whose residual exceeds the threshold and need a push next round
     std::vector<std::vector<std::unordered_set<int32_t>>>         queue_;
-    // Snapshot of queue contents from the last drain_queue() call, used by push_residuals().
+    // queued_nodes_[s][nt]: snapshot of queue_ taken by drain_queue() for the current round.
+    // Separating it from queue_ lets push_residuals() enqueue new nodes into queue_ without
+    // modifying the set currently being iterated.
     std::vector<std::vector<std::unordered_set<int32_t>>>         queued_nodes_;
 
-    // Persistent neighbor cache: pack_key(node_id, etype_id) -> neighbor list.
-    // Only nodes that have been requeued (and thus will be processed again) are
-    // promoted here from the per-iteration fetched map.
+    // -------------------------------------------------------------------------
+    // Neighbor cache
+    // -------------------------------------------------------------------------
+    // Persistent cache: pack_key(node_id, etype_id) → neighbor list.
+    // Only nodes that have been re-queued (and will therefore be processed again)
+    // are promoted here from the per-iteration fetched map in push_residuals().
+    // This avoids re-fetching neighbors for nodes processed in multiple iterations
+    // while keeping large neighbor lists of high-degree (never-requeued) nodes
+    // out of memory.
     std::unordered_map<uint64_t, std::vector<int32_t>> neighbor_cache_;
 };
 
+// Register PPRForwardPushState with Python via pybind11.
+//
+// TORCH_EXTENSION_NAME is set by PyTorch's setup() at build time to match the
+// Python module name (e.g. "ppr_forward_push").  At import time, Python calls
+// this function to populate the module with the C++ class.
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     py::class_<PPRForwardPushState>(m, "PPRForwardPushState")
+        // .def(py::init<...>()) exposes the constructor.  The template arguments
+        // list the exact C++ parameter types so pybind11 can convert Python
+        // arguments to the correct C++ types automatically.
         .def(py::init<
             torch::Tensor,
             int32_t,

From 906df014092f388eba9274121e6d838c81b8abdd Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Wed, 25 Mar 2026 18:48:27 +0000
Subject: [PATCH 05/14] Apply clang-format to ppr_forward_push.cpp

---
 .../cpp_extensions/ppr_forward_push.cpp       | 127 +++++++++---------
 1 file changed, 63 insertions(+), 64 deletions(-)

diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
index e22ac264f..0af3eb2b5 100644
--- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
+++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
@@ -1,8 +1,8 @@
+#include <pybind11/stl.h>     // Automatic conversion between C++ containers and Python types
 #include <torch/extension.h>  // PyTorch C++ API (tensors, TORCH_CHECK)
-#include <pybind11/stl.h>      // Automatic conversion between C++ containers and Python types
 
-#include <algorithm>    // std::partial_sort, std::min
-#include <cstdint>      // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t
+#include <algorithm>      // std::partial_sort, std::min
+#include <cstdint>        // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t
 #include <unordered_map>  // std::unordered_map — like Python dict, O(1) average lookup
 #include <unordered_set>  // std::unordered_set — like Python set, O(1) average lookup
 #include <vector>         // std::vector — like Python list, contiguous in memory
@@ -47,16 +47,12 @@ static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
 //   4.  push_residuals(fetched_by_etype_id)    — push residuals, update queue
 //   5.  extract_top_k(max_ppr_nodes)           — top-k selection per seed per node type
 class PPRForwardPushState {
-public:
-    PPRForwardPushState(
-        torch::Tensor seed_nodes,
-        int32_t seed_node_type_id,
-        double alpha,
-        double requeue_threshold_factor,
-        std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
-        std::vector<int32_t> edge_type_to_dst_ntype_id,
-        std::vector<torch::Tensor> degree_tensors
-    )
+   public:
+    PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha,
+                        double requeue_threshold_factor,
+                        std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
+                        std::vector<int32_t> edge_type_to_dst_ntype_id,
+                        std::vector<torch::Tensor> degree_tensors)
         : alpha_(alpha),
           one_minus_alpha_(1.0 - alpha),
           requeue_threshold_factor_(requeue_threshold_factor),
@@ -66,7 +62,6 @@ class PPRForwardPushState {
           node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
           edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
           degree_tensors_(std::move(degree_tensors)) {
-
         TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D");
         batch_size_     = static_cast<int32_t>(seed_nodes.size(0));
         num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());
@@ -74,15 +69,18 @@ class PPRForwardPushState {
         // Allocate per-seed, per-node-type tables.
         // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python.
         // Each inner element is an empty hash map / hash set for that (seed, ntype) pair.
-        ppr_scores_.assign(batch_size_,   std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-        residuals_.assign(batch_size_,    std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-        queue_.assign(batch_size_,        std::vector<std::unordered_set<int32_t>>(num_node_types_));
-        queued_nodes_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
+        ppr_scores_.assign(batch_size_,
+                           std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
+        residuals_.assign(batch_size_,
+                          std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
+        queue_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
+        queued_nodes_.assign(batch_size_,
+                             std::vector<std::unordered_set<int32_t>>(num_node_types_));
 
         // accessor<dtype, ndim>() returns a typed view into the tensor's data that
         // supports [i] indexing with bounds checking in debug builds.  Here we read
         // each seed node ID from the 1-D int64 tensor.
-        auto acc = seed_nodes.accessor<int64_t, 1>();
+        auto acc            = seed_nodes.accessor<int64_t, 1>();
         num_nodes_in_queue_ = batch_size_;
         for (int32_t i = 0; i < batch_size_; ++i) {
             // static_cast<int32_t>: explicit narrowing from int64 to int32.
@@ -116,7 +114,8 @@ class PPRForwardPushState {
         // (alias) to the existing set — clearing it modifies the original in-place
         // rather than operating on a copy.
         for (int32_t s = 0; s < batch_size_; ++s)
-            for (auto& qs : queued_nodes_[s]) qs.clear();
+            for (auto& qs : queued_nodes_[s])
+                qs.clear();
 
         // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for
         // edge type eid this round.  Using a set deduplicates nodes that appear
@@ -126,7 +125,8 @@ class PPRForwardPushState {
 
         for (int32_t s = 0; s < batch_size_; ++s) {
             for (int32_t nt = 0; nt < num_node_types_; ++nt) {
-                if (queue_[s][nt].empty()) continue;
+                if (queue_[s][nt].empty())
+                    continue;
 
                 // Move the live queue into the snapshot (no data copy — O(1)).
                 // queue_ is then reset to an empty set so new entries added by
@@ -213,7 +213,8 @@ class PPRForwardPushState {
         //   c. Enqueue any neighbor whose residual now exceeds the requeue threshold.
         for (int32_t s = 0; s < batch_size_; ++s) {
             for (int32_t nt = 0; nt < num_node_types_; ++nt) {
-                if (queued_nodes_[s][nt].empty()) continue;
+                if (queued_nodes_[s][nt].empty())
+                    continue;
 
                 for (int32_t src : queued_nodes_[s][nt]) {
                     // `auto&` gives a reference to the residual map for this
@@ -222,7 +223,7 @@ class PPRForwardPushState {
                     auto& src_res = residuals_[s][nt];
                     // .find() returns an iterator; .end() means "not found".
                     // We treat a missing entry as residual = 0.
-                    auto it = src_res.find(src);
+                    auto it    = src_res.find(src);
                     double res = (it != src_res.end()) ? it->second : 0.0;
 
                     // a. Absorb: move residual into the PPR score.
@@ -232,7 +233,8 @@ class PPRForwardPushState {
                     int32_t total_deg = get_total_degree(src, nt);
                     // Destination-only nodes (no outgoing edges) absorb residual
                     // into their PPR score but do not push further.
-                    if (total_deg == 0) continue;
+                    if (total_deg == 0)
+                        continue;
 
                     // b. Distribute: each neighbor of src (across all edge types
                     // from nt) receives an equal share of the pushed residual.
@@ -249,18 +251,20 @@ class PPRForwardPushState {
                         // We use a pointer (rather than copying the list) so we can check
                         // for absence with nullptr without allocating anything.
                         const std::vector<int32_t>* nbr_list = nullptr;
-                        auto fi = fetched.find(pack_key(src, eid));
+                        auto fi                              = fetched.find(pack_key(src, eid));
                         if (fi != fetched.end()) {
                             // `&fi->second` takes the address of the vector stored in
                             // the map — nbr_list now points to it without copying.
                             nbr_list = &fi->second;
                         } else {
                             auto ci = neighbor_cache_.find(pack_key(src, eid));
-                            if (ci != neighbor_cache_.end()) nbr_list = &ci->second;
+                            if (ci != neighbor_cache_.end())
+                                nbr_list = &ci->second;
                         }
                         // Skip if no neighbor list is available (node has no edges of
                         // this type, or the fetch returned an empty list).
-                        if (!nbr_list || nbr_list->empty()) continue;
+                        if (!nbr_list || nbr_list->empty())
+                            continue;
 
                         int32_t dst_nt = edge_type_to_dst_ntype_id_[eid];
 
@@ -270,7 +274,7 @@ class PPRForwardPushState {
                             residuals_[s][dst_nt][nbr] += res_per_nbr;
 
                             double threshold = requeue_threshold_factor_ *
-                                static_cast<double>(get_total_degree(nbr, dst_nt));
+                                               static_cast<double>(get_total_degree(nbr, dst_nt));
 
                             // Only enqueue if: (1) not already in queue for this
                             // iteration, and (2) residual exceeds the push threshold
@@ -315,13 +319,14 @@ class PPRForwardPushState {
         std::unordered_set<int32_t> active;
         for (int32_t s = 0; s < batch_size_; ++s)
             for (int32_t nt = 0; nt < num_node_types_; ++nt)
-                if (!ppr_scores_[s][nt].empty()) active.insert(nt);
+                if (!ppr_scores_[s][nt].empty())
+                    active.insert(nt);
 
         py::dict result;
         for (int32_t nt : active) {
             // Flat output vectors — entries for all seeds are concatenated.
             std::vector<int64_t> flat_ids;
-            std::vector<float>   flat_weights;
+            std::vector<float> flat_weights;
             std::vector<int64_t> valid_counts;
 
             for (int32_t s = 0; s < batch_size_; ++s) {
@@ -341,7 +346,8 @@ class PPRForwardPushState {
                     // is an anonymous comparator (like Python's `key=` argument).
                     // `.second` accesses the score (second element of the pair);
                     // `>` makes it descending (highest score first).
-                    std::partial_sort(items.begin(), items.begin() + k, items.end(),
+                    std::partial_sort(
+                        items.begin(), items.begin() + k, items.end(),
                         [](const auto& a, const auto& b) { return a.second > b.second; });
 
                     for (int32_t i = 0; i < k; ++i) {
@@ -355,27 +361,25 @@ class PPRForwardPushState {
             }
 
             // py::make_tuple wraps C++ values into a Python tuple.
-            result[py::int_(nt)] = py::make_tuple(
-                torch::tensor(flat_ids, torch::kLong),
-                torch::tensor(flat_weights, torch::kFloat),
-                torch::tensor(valid_counts, torch::kLong)
-            );
+            result[py::int_(nt)] = py::make_tuple(torch::tensor(flat_ids, torch::kLong),
+                                                  torch::tensor(flat_weights, torch::kFloat),
+                                                  torch::tensor(valid_counts, torch::kLong));
         }
         return result;
     }
 
-private:
+   private:
     // Look up the total (across all edge types) out-degree of a node.
     // Returns 0 for destination-only node types (no outgoing edges).
     int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const {
-        if (ntype_id >= static_cast<int32_t>(degree_tensors_.size())) return 0;
+        if (ntype_id >= static_cast<int32_t>(degree_tensors_.size()))
+            return 0;
         const auto& t = degree_tensors_[ntype_id];
-        if (t.numel() == 0) return 0;  // destination-only type: no outgoing edges
-        TORCH_CHECK(
-            node_id < static_cast<int32_t>(t.size(0)),
-            "Node ID ", node_id, " out of range for degree tensor of ntype_id ", ntype_id,
-            " (size=", t.size(0), "). This indicates corrupted graph data or a sampler bug."
-        );
+        if (t.numel() == 0)
+            return 0;  // destination-only type: no outgoing edges
+        TORCH_CHECK(node_id < static_cast<int32_t>(t.size(0)), "Node ID ", node_id,
+                    " out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0),
+                    "). This indicates corrupted graph data or a sampler bug.");
         // data_ptr<int32_t>() returns a raw C pointer to the tensor's int32 data
         // buffer.  Direct pointer indexing ([node_id]) is safe here because we
         // validated the bounds with TORCH_CHECK above.
@@ -385,13 +389,14 @@ class PPRForwardPushState {
     // -------------------------------------------------------------------------
     // Scalar algorithm parameters
     // -------------------------------------------------------------------------
-    double  alpha_;                       // Restart probability
-    double  one_minus_alpha_;             // 1 - alpha, precomputed to avoid repeated subtraction
-    double  requeue_threshold_factor_;    // alpha * eps; multiplied by degree to get per-node threshold
+    double alpha_;            // Restart probability
+    double one_minus_alpha_;  // 1 - alpha, precomputed to avoid repeated subtraction
+    double
+        requeue_threshold_factor_;  // alpha * eps; multiplied by degree to get per-node threshold
 
-    int32_t batch_size_;                  // Number of seeds in the current batch
-    int32_t num_node_types_;              // Total number of node types (homo + hetero)
-    int32_t num_nodes_in_queue_{0};       // Running count of nodes across all seeds / types
+    int32_t batch_size_;             // Number of seeds in the current batch
+    int32_t num_node_types_;         // Total number of node types (homo + hetero)
+    int32_t num_nodes_in_queue_{0};  // Running count of nodes across all seeds / types
 
     // -------------------------------------------------------------------------
     // Graph structure (read-only after construction)
@@ -400,10 +405,10 @@ class PPRForwardPushState {
     // traversed from that node type (outgoing or incoming, depending on edge_dir).
     std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;
     // edge_type_to_dst_ntype_id_[eid] → node type ID at the destination end.
-    std::vector<int32_t>              edge_type_to_dst_ntype_id_;
+    std::vector<int32_t> edge_type_to_dst_ntype_id_;
     // degree_tensors_[ntype_id][node_id] → total degree of that node across all
     // edge types traversable from its type.  Empty tensor means no outgoing edges.
-    std::vector<torch::Tensor>        degree_tensors_;
+    std::vector<torch::Tensor> degree_tensors_;
 
     // -------------------------------------------------------------------------
     // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id])
@@ -417,11 +422,11 @@ class PPRForwardPushState {
     // residuals_[s][nt]: node_id → unabsorbed probability mass waiting to be pushed
     std::vector<std::vector<std::unordered_map<int32_t, double>>> residuals_;
     // queue_[s][nt]: nodes whose residual exceeds the threshold and need a push next round
-    std::vector<std::vector<std::unordered_set<int32_t>>>         queue_;
+    std::vector<std::vector<std::unordered_set<int32_t>>> queue_;
     // queued_nodes_[s][nt]: snapshot of queue_ taken by drain_queue() for the current round.
     // Separating it from queue_ lets push_residuals() enqueue new nodes into queue_ without
     // modifying the set currently being iterated.
-    std::vector<std::vector<std::unordered_set<int32_t>>>         queued_nodes_;
+    std::vector<std::vector<std::unordered_set<int32_t>>> queued_nodes_;
 
     // -------------------------------------------------------------------------
     // Neighbor cache
@@ -445,15 +450,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         // .def(py::init<...>()) exposes the constructor.  The template arguments
         // list the exact C++ parameter types so pybind11 can convert Python
         // arguments to the correct C++ types automatically.
-        .def(py::init<
-            torch::Tensor,
-            int32_t,
-            double, double,
-            std::vector<std::vector<int32_t>>,
-            std::vector<int32_t>,
-            std::vector<torch::Tensor>
-        >())
-        .def("drain_queue",    &PPRForwardPushState::drain_queue)
+        .def(py::init<torch::Tensor, int32_t, double, double, std::vector<std::vector<int32_t>>,
+                      std::vector<int32_t>, std::vector<torch::Tensor>>())
+        .def("drain_queue", &PPRForwardPushState::drain_queue)
         .def("push_residuals", &PPRForwardPushState::push_residuals)
-        .def("extract_top_k",  &PPRForwardPushState::extract_top_k);
+        .def("extract_top_k", &PPRForwardPushState::extract_top_k);
 }

From dd118ef09320b0afea72f79b93dbc3d691a6e4be Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Wed, 25 Mar 2026 20:49:28 +0000
Subject: [PATCH 06/14] Move PPR C++ to gigl/csrc following PyTorch csrc
 conventions

---
 gigl/csrc/distributed/__init__.py             |   9 +
 gigl/csrc/distributed/ppr_forward_push.cpp    | 247 ++++++++++
 gigl/csrc/distributed/ppr_forward_push.h      | 121 +++++
 .../distributed}/ppr_forward_push.pyi         |   0
 .../distributed/python_ppr_forward_push.cpp   |  63 +++
 gigl/distributed/cpp_extensions/__init__.py   |   9 -
 .../cpp_extensions/ppr_forward_push.cpp       | 458 ------------------
 gigl/distributed/dist_ppr_sampler.py          |   2 +-
 8 files changed, 441 insertions(+), 468 deletions(-)
 create mode 100644 gigl/csrc/distributed/__init__.py
 create mode 100644 gigl/csrc/distributed/ppr_forward_push.cpp
 create mode 100644 gigl/csrc/distributed/ppr_forward_push.h
 rename gigl/{distributed/cpp_extensions => csrc/distributed}/ppr_forward_push.pyi (100%)
 create mode 100644 gigl/csrc/distributed/python_ppr_forward_push.cpp
 delete mode 100644 gigl/distributed/cpp_extensions/__init__.py
 delete mode 100644 gigl/distributed/cpp_extensions/ppr_forward_push.cpp

diff --git a/gigl/csrc/distributed/__init__.py b/gigl/csrc/distributed/__init__.py
new file mode 100644
index 000000000..d8ffa921a
--- /dev/null
+++ b/gigl/csrc/distributed/__init__.py
@@ -0,0 +1,9 @@
+try:
+    from gigl.csrc.distributed.ppr_forward_push import PPRForwardPushState
+except ImportError as e:
+    raise ImportError(
+        "PPR C++ extension not compiled. "
+        "Run `make build_cpp_extensions` from the GiGL root to build it."
+    ) from e
+
+__all__ = ["PPRForwardPushState"]
diff --git a/gigl/csrc/distributed/ppr_forward_push.cpp b/gigl/csrc/distributed/ppr_forward_push.cpp
new file mode 100644
index 000000000..a514907ab
--- /dev/null
+++ b/gigl/csrc/distributed/ppr_forward_push.cpp
@@ -0,0 +1,247 @@
+#include "ppr_forward_push.h"
+
+// Builds the per-seed PPR state for one batch.  seed_nodes must be a 1-D int64 tensor;
+// the graph-structure vectors are taken by value and moved into the members.
+PPRForwardPushState::PPRForwardPushState(
+    torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha,
+    double requeue_threshold_factor,
+    std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
+    std::vector<int32_t> edge_type_to_dst_ntype_id, std::vector<torch::Tensor> degree_tensors)
+    : alpha_(alpha),
+      one_minus_alpha_(1.0 - alpha),
+      requeue_threshold_factor_(requeue_threshold_factor),
+      node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
+      edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
+      degree_tensors_(std::move(degree_tensors)) {
+    TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D");
+    batch_size_ = static_cast<int32_t>(seed_nodes.size(0));
+    num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());
+    // The seed type indexes every per-seed table below; an out-of-range value would
+    // write past the end of those vectors, so reject it up front.
+    TORCH_CHECK(seed_node_type_id >= 0 && seed_node_type_id < num_node_types_,
+                "seed_node_type_id ", seed_node_type_id, " out of range for ",
+                num_node_types_, " node types");
+
+    // Allocate the [seed][node type] tables: one empty map/set per pair.
+    ppr_scores_.assign(batch_size_,
+                       std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
+    residuals_.assign(batch_size_,
+                      std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
+    queue_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
+    queued_nodes_.assign(batch_size_,
+                         std::vector<std::unordered_set<int32_t>>(num_node_types_));
+
+    // accessor<int64_t, 1>() is a typed 1-D view over the tensor's data.
+    auto acc = seed_nodes.accessor<int64_t, 1>();
+    num_nodes_in_queue_ = batch_size_;  // exactly one queued node (the seed) per batch row
+    for (int32_t i = 0; i < batch_size_; ++i) {
+        int32_t seed = static_cast<int32_t>(acc[i]);
+        // Each seed starts with residual = alpha; it is absorbed on the first push.
+        residuals_[i][seed_node_type_id][seed] = alpha_;
+        queue_[i][seed_node_type_id].insert(seed);
+    }
+}
+
+// Moves each non-empty live queue into its snapshot and gathers, per edge type, the node
+// IDs whose neighbor lists are absent from neighbor_cache_.  std::nullopt = nothing queued.
+std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::drain_queue() {
+    if (num_nodes_in_queue_ == 0)
+        return std::nullopt;
+
+    // Discard the snapshot left over from the previous round.
+    for (auto& per_seed : queued_nodes_)
+        for (auto& snapshot : per_seed)
+            snapshot.clear();
+
+    // pending[eid]: distinct node IDs needing a fetch for edge type eid.  The set folds
+    // a node queued under several seeds into a single (node, etype) request.
+    std::unordered_map<int32_t, std::unordered_set<int32_t>> pending;
+
+    for (int32_t seed_idx = 0; seed_idx < batch_size_; ++seed_idx) {
+        for (int32_t ntype = 0; ntype < num_node_types_; ++ntype) {
+            auto& live = queue_[seed_idx][ntype];
+            if (live.empty())
+                continue;
+            // Hand the live set to the snapshot, then reset the moved-from set.
+            auto& snapshot = queued_nodes_[seed_idx][ntype];
+            snapshot = std::move(live);
+            live.clear();
+            num_nodes_in_queue_ -= static_cast<int32_t>(snapshot.size());
+
+            for (const int32_t node : snapshot) {
+                for (const int32_t etype : node_type_to_edge_type_ids_[ntype]) {
+                    if (neighbor_cache_.count(pack_key(node, etype)) == 0)
+                        pending[etype].insert(node);
+                }
+            }
+        }
+    }
+
+    std::unordered_map<int32_t, torch::Tensor> lookups;
+    for (const auto& [etype, nodes] : pending) {
+        std::vector<int64_t> ids(nodes.begin(), nodes.end());
+        lookups[etype] = torch::tensor(ids, torch::kLong);
+    }
+    return lookups;
+}
+
+// Applies one forward-push round to every node snapshotted by drain_queue().
+// fetched_by_etype_id maps etype_id -> (node_ids, flat_nbrs, counts), all 1-D int64;
+// malformed inputs (length mismatch, counts overrunning flat_nbrs) raise via TORCH_CHECK.
+void PPRForwardPushState::push_residuals(
+    const std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>&
+        fetched_by_etype_id) {
+    // Step 1: Unpack the input map into a C++ map keyed by pack_key(node_id, etype_id)
+    // for fast lookup during the residual-push loop below.
+    std::unordered_map<uint64_t, std::vector<int32_t>> fetched;
+    for (const auto& [eid, tup] : fetched_by_etype_id) {
+        const auto& node_ids_t = std::get<0>(tup);
+        const auto& flat_nbrs_t = std::get<1>(tup);
+        const auto& counts_t = std::get<2>(tup);
+
+        // accessor<int64_t, 1>() gives a typed 1-D view; element access through it is
+        // plain indexing, so the sizes are validated explicitly below.
+        auto node_acc = node_ids_t.accessor<int64_t, 1>();
+        auto nbr_acc = flat_nbrs_t.accessor<int64_t, 1>();
+        auto cnt_acc = counts_t.accessor<int64_t, 1>();
+        TORCH_CHECK(counts_t.size(0) == node_ids_t.size(0), "counts has ", counts_t.size(0),
+                    " entries but node_ids has ", node_ids_t.size(0), " for etype_id ", eid);
+
+        // Walk the flat neighbor list, slicing out each node's neighbors using
+        // the running offset into the concatenated flat buffer.
+        int64_t offset = 0;
+        for (int64_t i = 0; i < node_ids_t.size(0); ++i) {
+            int32_t nid = static_cast<int32_t>(node_acc[i]);
+            int64_t count = cnt_acc[i];
+            TORCH_CHECK(count >= 0 && count <= flat_nbrs_t.size(0) - offset,
+                        "Invalid neighbor count ", count, " at offset ", offset,
+                        " for etype_id ", eid, " (flat_nbrs size=", flat_nbrs_t.size(0), ")");
+            std::vector<int32_t> nbrs(count);
+            for (int64_t j = 0; j < count; ++j)
+                nbrs[j] = static_cast<int32_t>(nbr_acc[offset + j]);
+            fetched[pack_key(nid, eid)] = std::move(nbrs);
+            offset += count;
+        }
+    }
+
+    // Step 2: For every node captured in queued_nodes_ by drain_queue(), push once:
+    //   a. absorb its residual into the PPR score;
+    //   b. give each neighbor (1-alpha) * residual / total_degree;
+    //   c. enqueue any neighbor whose residual reaches the requeue threshold.
+    for (int32_t s = 0; s < batch_size_; ++s) {
+        for (int32_t nt = 0; nt < num_node_types_; ++nt) {
+            if (queued_nodes_[s][nt].empty())
+                continue;
+
+            for (int32_t src : queued_nodes_[s][nt]) {
+                auto& src_res = residuals_[s][nt];
+                auto it = src_res.find(src);
+                double res = (it != src_res.end()) ? it->second : 0.0;
+
+                // a. Absorb: move residual into the PPR score.
+                ppr_scores_[s][nt][src] += res;
+                src_res[src] = 0.0;
+
+                int32_t total_deg = get_total_degree(src, nt);
+                // Destination-only nodes absorb residual but do not push further.
+                if (total_deg == 0)
+                    continue;
+
+                // b. Distribute: each neighbor receives an equal share.
+                double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_deg);
+                for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
+                    // drain_queue() only requests keys missing from neighbor_cache_, so use
+                    // this round's fetch when present and fall back to the cache otherwise.
+                    // Cache insertions further down do not relocate existing vectors.
+                    const std::vector<int32_t>* nbr_list = nullptr;
+                    auto fi = fetched.find(pack_key(src, eid));
+                    if (fi != fetched.end()) {
+                        nbr_list = &fi->second;
+                    } else {
+                        auto ci = neighbor_cache_.find(pack_key(src, eid));
+                        if (ci != neighbor_cache_.end())
+                            nbr_list = &ci->second;
+                    }
+                    if (!nbr_list || nbr_list->empty())
+                        continue;
+                    int32_t dst_nt = edge_type_to_dst_ntype_id_[eid];
+                    // c. Accumulate residual per neighbor; re-enqueue at the threshold.
+                    for (int32_t nbr : *nbr_list) {
+                        residuals_[s][dst_nt][nbr] += res_per_nbr;
+                        double threshold = requeue_threshold_factor_ *
+                                           static_cast<double>(get_total_degree(nbr, dst_nt));
+                        if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() &&
+                            residuals_[s][dst_nt][nbr] >= threshold) {
+                            queue_[s][dst_nt].insert(nbr);
+                            ++num_nodes_in_queue_;
+
+                            // Promote neighbor lists to the persistent cache: this node will
+                            // be processed next iteration, so caching avoids a re-fetch.
+                            for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) {
+                                uint64_t pk = pack_key(nbr, peid);
+                                if (neighbor_cache_.find(pk) == neighbor_cache_.end()) {
+                                    auto pfi = fetched.find(pk);
+                                    if (pfi != fetched.end())
+                                        neighbor_cache_[pk] = pfi->second;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Selects, for each seed and node type, up to max_ppr_nodes nodes with the highest PPR
+// score.  Returns {ntype_id: (flat_ids int64, flat_weights float32, valid_counts int64)};
+// node types with no score for any seed are omitted.
+std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
+PPRForwardPushState::extract_top_k(int32_t max_ppr_nodes) {
+    // Clamp so a negative request cannot leak a negative entry into valid_counts.
+    const int32_t limit = std::max<int32_t>(max_ppr_nodes, 0);
+    std::unordered_set<int32_t> active;
+    for (int32_t s = 0; s < batch_size_; ++s)
+        for (int32_t nt = 0; nt < num_node_types_; ++nt)
+            if (!ppr_scores_[s][nt].empty())
+                active.insert(nt);
+    std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> result;
+    for (int32_t nt : active) {
+        std::vector<int64_t> flat_ids;
+        std::vector<float> flat_weights;
+        std::vector<int64_t> valid_counts;
+        for (int32_t s = 0; s < batch_size_; ++s) {
+            const auto& scores = ppr_scores_[s][nt];
+            int32_t k = std::min(limit, static_cast<int32_t>(scores.size()));
+            if (k > 0) {
+                std::vector<std::pair<int32_t, double>> items(scores.begin(), scores.end());
+                std::partial_sort(
+                    items.begin(), items.begin() + k, items.end(),
+                    [](const auto& a, const auto& b) { return a.second > b.second; });
+                for (int32_t i = 0; i < k; ++i) {
+                    flat_ids.push_back(static_cast<int64_t>(items[i].first));
+                    // Scores stay double internally; only the output is narrowed to float32.
+                    flat_weights.push_back(static_cast<float>(items[i].second));
+                }
+            }
+            valid_counts.push_back(static_cast<int64_t>(k));
+        }
+        result[nt] = {torch::tensor(flat_ids, torch::kLong),
+                      torch::tensor(flat_weights, torch::kFloat),
+                      torch::tensor(valid_counts, torch::kLong)};
+    }
+    return result;
+}
+
+// Returns the stored total degree of node_id within node type ntype_id, or 0 when the type
+// is out of range (including negative) or its degree tensor is empty.  Bad node_id raises.
+int32_t PPRForwardPushState::get_total_degree(int32_t node_id, int32_t ntype_id) const {
+    if (ntype_id < 0 || ntype_id >= static_cast<int32_t>(degree_tensors_.size()) ||
+        degree_tensors_[ntype_id].numel() == 0)
+        return 0;
+    const auto& t = degree_tensors_[ntype_id];
+    TORCH_CHECK(node_id >= 0 && node_id < t.size(0), "Node ID ", node_id,
+                " out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0),
+                "). This indicates corrupted graph data or a sampler bug.");
+    return t.data_ptr<int32_t>()[node_id];  // raw, unchecked read: bounds validated above
+}
diff --git a/gigl/csrc/distributed/ppr_forward_push.h b/gigl/csrc/distributed/ppr_forward_push.h
new file mode 100644
index 000000000..7f0c92f49
--- /dev/null
+++ b/gigl/csrc/distributed/ppr_forward_push.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include <torch/torch.h>
+
+#include <algorithm>      // std::partial_sort, std::min
+#include <cstdint>        // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t
+#include <optional>       // std::optional for nullable return values
+#include <tuple>          // std::tuple for multi-value returns
+#include <unordered_map>  // std::unordered_map — like Python dict, O(1) average lookup
+#include <unordered_set>  // std::unordered_set — like Python set, O(1) average lookup
+#include <vector>         // std::vector — like Python list, contiguous in memory
+
+// Builds the 64-bit hash-map key for a (node_id, etype_id) pair.  One integer key is
+// cheaper to hash than a pair of integers (std::unordered_map has no built-in pair
+// hash).
+//
+// Bit layout:
+//   bits 63–32: node_id  (upper half)
+//   bits 31– 0: etype_id (lower half)
+//
+// Each input goes through uint32_t first so that a negative int32_t (e.g. -1 =
+// 0xFFFFFFFF) keeps its 32-bit pattern instead of being sign-extended into the
+// other half of the key when widened to 64 bits.
+static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
+    const uint64_t upper = static_cast<uint32_t>(node_id);
+    const uint64_t lower = static_cast<uint32_t>(etype_id);
+    return (upper << 32) | lower;
+}
+
+// C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006).
+//
+// All hot-loop state (scores, residuals, queue, neighbor cache) lives inside
+// this object.  The distributed neighbor fetch is kept in Python because it
+// involves async RPC calls that C++ cannot drive directly.
+//
+// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache.
+// Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors).
+//
+// Typical call sequence per batch:
+//   1.  PPRForwardPushState(seed_nodes, ...)      — init per-seed residuals / queue
+//   loop until drain_queue() reports convergence (std::nullopt / Python None):
+//     2.  drain_queue()                           — drain queue → nodes needing lookup
+//     3.  <Python: _batch_fetch_neighbors(...)>   — distributed RPC fetch (stays in Python)
+//     4.  push_residuals(fetched_by_etype_id)     — push residuals, update queue
+//   5.  extract_top_k(max_ppr_nodes)              — top-k selection per seed per node type
+class PPRForwardPushState {
+   public:  // ctor: seed_nodes is a 1-D int64 tensor of seed IDs of type seed_node_type_id
+    PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha,
+                        double requeue_threshold_factor,  // alpha * eps
+                        std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
+                        std::vector<int32_t> edge_type_to_dst_ntype_id,
+                        std::vector<torch::Tensor> degree_tensors);  // indexed by ntype_id
+
+    // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch
+    // neighbor lookup.  Also snapshots the drained nodes into queued_nodes_ for
+    // use by push_residuals().
+    //
+    // Return value semantics:
+    //   - std::nullopt   → queue was already empty; convergence achieved; stop the loop.
+    //   - empty map      → nodes were drained but all were cached; call push_residuals({}).
+    //   - non-empty map  → {etype_id → 1-D int64 tensor of node IDs} needing neighbor lookup.
+    std::optional<std::unordered_map<int32_t, torch::Tensor>> drain_queue();
+
+    // Push residuals to neighbors given the fetched neighbor data.
+    //
+    // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)}
+    //   - node_ids_tensor:  [N]           int64 — source node IDs fetched for this edge type
+    //   - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat
+    //   - counts_tensor:    [N]           int64 — neighbor count for each source node
+    void push_residuals(const std::unordered_map<
+                        int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>&
+                            fetched_by_etype_id);
+
+    // Extract top-k PPR nodes per seed per node type.
+    //
+    // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}.
+    // Only node types that received any PPR score are included in the output.
+    //
+    // Output layout for a batch of B seeds:
+    //   flat_ids[0 : valid_counts[0]]                 → top-k nodes for seed 0
+    //   flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1
+    //   ...
+    std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
+    extract_top_k(int32_t max_ppr_nodes);
+
+   private:
+    // Total degree of a node across all edge types traversed from its type; 0 when the type
+    // has no degree data (destination-only).  A node_id past the tensor's end raises.
+    int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const;
+
+    // -------------------------------------------------------------------------
+    // Scalar algorithm parameters
+    // -------------------------------------------------------------------------
+    double alpha_;            // Restart probability
+    double one_minus_alpha_;  // 1 - alpha, precomputed to avoid repeated subtraction
+    double requeue_threshold_factor_;  // alpha * eps; multiplied by degree to get per-node threshold
+
+    int32_t batch_size_;             // Number of seeds in the current batch
+    int32_t num_node_types_;         // Total number of node types (homo + hetero)
+    int32_t num_nodes_in_queue_{0};  // Running count of nodes across all seeds / types
+
+    // -------------------------------------------------------------------------
+    // Graph structure (read-only after construction)
+    // -------------------------------------------------------------------------
+    std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;  // [ntype_id] -> edge type IDs
+    std::vector<int32_t> edge_type_to_dst_ntype_id_;  // [etype_id] -> destination ntype_id
+    std::vector<torch::Tensor> degree_tensors_;  // [ntype_id] -> int32 degree per node; may be empty
+
+    // -------------------------------------------------------------------------
+    // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id])
+    // -------------------------------------------------------------------------
+    std::vector<std::vector<std::unordered_map<int32_t, double>>> ppr_scores_;  // node -> score
+    std::vector<std::vector<std::unordered_map<int32_t, double>>> residuals_;  // node -> residual
+    std::vector<std::vector<std::unordered_set<int32_t>>> queue_;  // nodes to drain next round
+    std::vector<std::vector<std::unordered_set<int32_t>>> queued_nodes_;  // drained snapshot
+
+    // -------------------------------------------------------------------------
+    // Neighbor cache
+    // -------------------------------------------------------------------------
+    std::unordered_map<uint64_t, std::vector<int32_t>> neighbor_cache_;  // pack_key -> neighbors
+};
diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.pyi b/gigl/csrc/distributed/ppr_forward_push.pyi
similarity index 100%
rename from gigl/distributed/cpp_extensions/ppr_forward_push.pyi
rename to gigl/csrc/distributed/ppr_forward_push.pyi
diff --git a/gigl/csrc/distributed/python_ppr_forward_push.cpp b/gigl/csrc/distributed/python_ppr_forward_push.cpp
new file mode 100644
index 000000000..ebf3fa27a
--- /dev/null
+++ b/gigl/csrc/distributed/python_ppr_forward_push.cpp
@@ -0,0 +1,63 @@
+// Python bindings for PPRForwardPushState.
+//
+// Follows PyTorch's csrc convention: pure C++ algorithm lives in
+// ppr_forward_push.{h,cpp}; this file only handles type conversion between
+// Python (pybind11) and C++ types, then delegates to the C++ implementation.
+
+#include <pybind11/stl.h>
+#include <torch/extension.h>
+
+#include "ppr_forward_push.h"
+
+namespace py = pybind11;
+
+// drain_queue binding: forwards to the C++ method and converts its optional map.
+// Python receives None on convergence, otherwise dict[int, Tensor] keyed by etype_id.
+static py::object drain_queue_wrapper(PPRForwardPushState& self) {
+    const auto pending = self.drain_queue();
+    if (!pending.has_value())
+        return py::none();
+
+    // Copy each entry into a Python dict keyed by int.
+    py::dict lookups;
+    for (const auto& [etype_id, node_ids] : *pending)
+        lookups[py::int_(etype_id)] = node_ids;
+    return lookups;
+}
+
+// push_residuals binding: Python supplies dict[int, tuple[Tensor, Tensor, Tensor]].
+// Each entry is unpacked into the C++ map type the method expects, then forwarded.
+static void push_residuals_wrapper(PPRForwardPushState& self, py::dict fetched_by_etype_id) {
+    std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> fetched;
+    for (auto entry : fetched_by_etype_id) {
+        const int32_t etype_id = entry.first.cast<int32_t>();
+        py::tuple triple = entry.second.cast<py::tuple>();
+        fetched[etype_id] = {triple[0].cast<torch::Tensor>(), triple[1].cast<torch::Tensor>(),
+                             triple[2].cast<torch::Tensor>()};
+    }
+    self.push_residuals(fetched);
+}
+
+// extract_top_k binding: converts map<ntype_id, tuple<Tensor, Tensor, Tensor>> from C++
+// into the Python-facing dict[int, tuple[Tensor, Tensor, Tensor]].
+static py::dict extract_top_k_wrapper(PPRForwardPushState& self, int32_t max_ppr_nodes) {
+    const auto top_k = self.extract_top_k(max_ppr_nodes);
+    py::dict per_ntype;
+    for (const auto& [ntype_id, triple] : top_k) {
+        const auto& [ids, weights, counts] = triple;
+        per_ntype[py::int_(ntype_id)] = py::make_tuple(ids, weights, counts);
+    }
+    return per_ntype;
+}
+
+// TORCH_EXTENSION_NAME is defined by PyTorch's extension build as the module name; it is
+// presumably "ppr_forward_push" (not this file's name) — TODO confirm in the build config.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    py::class_<PPRForwardPushState>(m, "PPRForwardPushState")
+        .def(py::init<torch::Tensor, int32_t, double, double,
+                      std::vector<std::vector<int32_t>>, std::vector<int32_t>,
+                      std::vector<torch::Tensor>>())
+        .def("drain_queue", drain_queue_wrapper)        // -> None | dict[int, Tensor]
+        .def("push_residuals", push_residuals_wrapper)  // dict[int, (Tensor, Tensor, Tensor)]
+        .def("extract_top_k", extract_top_k_wrapper);   // -> dict[int, (Tensor, Tensor, Tensor)]
+}
diff --git a/gigl/distributed/cpp_extensions/__init__.py b/gigl/distributed/cpp_extensions/__init__.py
deleted file mode 100644
index d375f59b1..000000000
--- a/gigl/distributed/cpp_extensions/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-try:
-    from gigl.distributed.cpp_extensions.ppr_forward_push import PPRForwardPushState
-except ImportError as e:
-    raise ImportError(
-        "PPR C++ extension not compiled. "
-        "Run `uv pip install -e .` from the GiGL root to build it."
-    ) from e
-
-__all__ = ["PPRForwardPushState"]
diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
deleted file mode 100644
index 0af3eb2b5..000000000
--- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp
+++ /dev/null
@@ -1,458 +0,0 @@
-#include <pybind11/stl.h>     // Automatic conversion between C++ containers and Python types
-#include <torch/extension.h>  // PyTorch C++ API (tensors, TORCH_CHECK)
-
-#include <algorithm>      // std::partial_sort, std::min
-#include <cstdint>        // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t
-#include <unordered_map>  // std::unordered_map — like Python dict, O(1) average lookup
-#include <unordered_set>  // std::unordered_set — like Python set, O(1) average lookup
-#include <vector>         // std::vector — like Python list, contiguous in memory
-
-namespace py = pybind11;  // Alias for the pybind11 namespace (bridges C++ ↔ Python)
-
-// Combine (node_id, etype_id) into a single 64-bit integer for use as a hash
-// map key.  A single 64-bit integer is cheaper to hash than a pair of two
-// integers (std::unordered_map has no built-in pair hash).
-//
-// Bit layout:
-//   bits 63–32: node_id  (upper half)
-//   bits 31– 0: etype_id (lower half)
-//
-// Both inputs are cast through uint32_t before packing.  Without this, a
-// negative int32_t (e.g. -1 = 0xFFFFFFFF) would be sign-extended to a full
-// 64-bit value, corrupting the upper bits when shifted.  Reinterpreting as
-// uint32_t first treats the bit pattern as-is (no sign extension).
-//
-// `static inline` means: define this function here in the translation unit
-// (not in a separate object file) and ask the compiler to inline it at each
-// call site instead of generating a function call.
-static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
-    return (static_cast<uint64_t>(static_cast<uint32_t>(node_id)) << 32) |
-           static_cast<uint32_t>(etype_id);
-}
-
-// C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006).
-//
-// All hot-loop state (scores, residuals, queue, neighbor cache) lives inside
-// this object.  The distributed neighbor fetch is kept in Python because it
-// involves async RPC calls that C++ cannot drive directly.
-//
-// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache.
-// Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors).
-//
-// Typical call sequence per batch:
-//   1.  PPRForwardPushState(seed_nodes, ...)   — init per-seed residuals / queue
-//   while True:
-//   2.  drain_queue()                          — drain queue → nodes needing lookup
-//   3.  <Python: _batch_fetch_neighbors(...)>  — distributed RPC fetch (stays in Python)
-//   4.  push_residuals(fetched_by_etype_id)    — push residuals, update queue
-//   5.  extract_top_k(max_ppr_nodes)           — top-k selection per seed per node type
-class PPRForwardPushState {
-   public:
-    PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha,
-                        double requeue_threshold_factor,
-                        std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
-                        std::vector<int32_t> edge_type_to_dst_ntype_id,
-                        std::vector<torch::Tensor> degree_tensors)
-        : alpha_(alpha),
-          one_minus_alpha_(1.0 - alpha),
-          requeue_threshold_factor_(requeue_threshold_factor),
-          // std::move transfers ownership of each vector into the member variable
-          // without copying its contents — equivalent to Python's list hand-off
-          // when you no longer need the original.
-          node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
-          edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
-          degree_tensors_(std::move(degree_tensors)) {
-        TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D");
-        batch_size_     = static_cast<int32_t>(seed_nodes.size(0));
-        num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());
-
-        // Allocate per-seed, per-node-type tables.
-        // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python.
-        // Each inner element is an empty hash map / hash set for that (seed, ntype) pair.
-        ppr_scores_.assign(batch_size_,
-                           std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-        residuals_.assign(batch_size_,
-                          std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-        queue_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
-        queued_nodes_.assign(batch_size_,
-                             std::vector<std::unordered_set<int32_t>>(num_node_types_));
-
-        // accessor<dtype, ndim>() returns a typed view into the tensor's data that
-        // supports [i] indexing with bounds checking in debug builds.  Here we read
-        // each seed node ID from the 1-D int64 tensor.
-        auto acc            = seed_nodes.accessor<int64_t, 1>();
-        num_nodes_in_queue_ = batch_size_;
-        for (int32_t i = 0; i < batch_size_; ++i) {
-            // static_cast<int32_t>: explicit narrowing from int64 to int32.
-            // The Python caller guarantees node IDs fit in 32 bits.
-            int32_t seed = static_cast<int32_t>(acc[i]);
-            // PPR initialisation: each seed starts with residual = alpha (the
-            // restart probability).  The first push will move alpha into ppr_score
-            // and distribute (1-alpha)*alpha to the seed's neighbors.
-            residuals_[i][seed_node_type_id][seed] = alpha_;
-            queue_[i][seed_node_type_id].insert(seed);
-        }
-    }
-
-    // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch
-    // neighbor lookup.  Also snapshots the drained nodes into queued_nodes_ for
-    // use by push_residuals().
-    //
-    // Return value semantics (py::object can hold any Python value):
-    //   - py::none()  → queue was already empty; convergence achieved; stop the loop.
-    //   - py::dict{}  → nodes were drained.  The dict maps etype_id → 1-D int64
-    //                   tensor of node IDs that need neighbor lookups this round.
-    //                   May be empty if all drained nodes were already in the cache
-    //                   or had no outgoing edges — push_residuals must still be called
-    //                   to flush their accumulated residual into ppr_scores_.
-    py::object drain_queue() {
-        if (num_nodes_in_queue_ == 0) {
-            return py::none();
-        }
-
-        // Reset the snapshot from the previous iteration.  `auto&` is a reference
-        // (alias) to the existing set — clearing it modifies the original in-place
-        // rather than operating on a copy.
-        for (int32_t s = 0; s < batch_size_; ++s)
-            for (auto& qs : queued_nodes_[s])
-                qs.clear();
-
-        // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for
-        // edge type eid this round.  Using a set deduplicates nodes that appear
-        // in multiple seeds' queues: we only fetch each (node, etype) pair once
-        // regardless of how many seeds need it.
-        std::unordered_map<int32_t, std::unordered_set<int32_t>> nodes_to_lookup;
-
-        for (int32_t s = 0; s < batch_size_; ++s) {
-            for (int32_t nt = 0; nt < num_node_types_; ++nt) {
-                if (queue_[s][nt].empty())
-                    continue;
-
-                // Move the live queue into the snapshot (no data copy — O(1)).
-                // queue_ is then reset to an empty set so new entries added by
-                // push_residuals() in this same iteration don't interfere.
-                queued_nodes_[s][nt] = std::move(queue_[s][nt]);
-                queue_[s][nt].clear();
-                num_nodes_in_queue_ -= static_cast<int32_t>(queued_nodes_[s][nt].size());
-
-                for (int32_t node_id : queued_nodes_[s][nt]) {
-                    for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
-                        // Only request a fetch if the neighbor list isn't already
-                        // cached from a previous iteration.
-                        if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) {
-                            nodes_to_lookup[eid].insert(node_id);
-                        }
-                    }
-                }
-            }
-        }
-
-        // Convert to Python: {etype_id (int) → 1-D int64 tensor of node IDs}.
-        // py::int_(eid) wraps a C++ int as a Python int so it can be used as a
-        // dict key on the Python side.
-        py::dict result;
-        for (auto& [eid, node_set] : nodes_to_lookup) {
-            // Copy the set into a vector first: torch::tensor() requires a
-            // contiguous sequence, not an unordered_set iterator.
-            std::vector<int64_t> ids(node_set.begin(), node_set.end());
-            result[py::int_(eid)] = torch::tensor(ids, torch::kLong);
-        }
-        return result;
-    }
-
-    // Push residuals to neighbors given the fetched neighbor data.
-    //
-    // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)}
-    //   - node_ids_tensor:  [N]           int64 — source node IDs fetched for this edge type
-    //   - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat
-    //   - counts_tensor:    [N]           int64 — neighbor count for each source node
-    //
-    // For example, if nodes 3 and 7 were fetched for etype 0:
-    //   node_ids  = [3, 7]
-    //   flat_nbrs = [10, 11, 12, 20]   ← node 3 has nbrs {10,11,12}, node 7 has nbr {20}
-    //   counts    = [3, 1]
-    void push_residuals(py::dict fetched_by_etype_id) {
-        // Step 1: Unpack the Python dict into a C++ map for fast lookup during
-        // the residual-push loop below.
-        // fetched: pack_key(node_id, etype_id) → neighbor list (as int32_t vector)
-        std::unordered_map<uint64_t, std::vector<int32_t>> fetched;
-        for (auto item : fetched_by_etype_id) {
-            int32_t eid = item.first.cast<int32_t>();
-            // .cast<py::tuple>() interprets the Python value as a tuple so we
-            // can index into it with [0], [1], [2].
-            auto tup         = item.second.cast<py::tuple>();
-            auto node_ids_t  = tup[0].cast<torch::Tensor>();
-            auto flat_nbrs_t = tup[1].cast<torch::Tensor>();
-            auto counts_t    = tup[2].cast<torch::Tensor>();
-
-            // accessor<int64_t, 1>() gives a bounds-checked, typed 1-D view into
-            // each tensor's data — equivalent to iterating over a NumPy array.
-            auto node_acc = node_ids_t.accessor<int64_t, 1>();
-            auto nbr_acc  = flat_nbrs_t.accessor<int64_t, 1>();
-            auto cnt_acc  = counts_t.accessor<int64_t, 1>();
-
-            // Walk the flat neighbor list, slicing out each node's neighbors using
-            // the running offset into the concatenated flat buffer.
-            int64_t offset = 0;
-            for (int64_t i = 0; i < node_ids_t.size(0); ++i) {
-                int32_t nid   = static_cast<int32_t>(node_acc[i]);
-                int64_t count = cnt_acc[i];
-                std::vector<int32_t> nbrs(count);
-                for (int64_t j = 0; j < count; ++j)
-                    nbrs[j] = static_cast<int32_t>(nbr_acc[offset + j]);
-                // std::move: hand off nbrs to the map without copying its contents.
-                fetched[pack_key(nid, eid)] = std::move(nbrs);
-                offset += count;
-            }
-        }
-
-        // Step 2: For every node that was in the queue (captured in queued_nodes_
-        // by drain_queue()), apply one PPR push step:
-        //   a. Absorb residual into the PPR score.
-        //   b. Distribute (1-alpha) * residual equally to each neighbor.
-        //   c. Enqueue any neighbor whose residual now exceeds the requeue threshold.
-        for (int32_t s = 0; s < batch_size_; ++s) {
-            for (int32_t nt = 0; nt < num_node_types_; ++nt) {
-                if (queued_nodes_[s][nt].empty())
-                    continue;
-
-                for (int32_t src : queued_nodes_[s][nt]) {
-                    // `auto&` gives a reference to the residual map for this
-                    // (seed, node_type) pair so we can read and write it without
-                    // an extra hash lookup each time.
-                    auto& src_res = residuals_[s][nt];
-                    // .find() returns an iterator; .end() means "not found".
-                    // We treat a missing entry as residual = 0.
-                    auto it    = src_res.find(src);
-                    double res = (it != src_res.end()) ? it->second : 0.0;
-
-                    // a. Absorb: move residual into the PPR score.
-                    ppr_scores_[s][nt][src] += res;
-                    src_res[src] = 0.0;
-
-                    int32_t total_deg = get_total_degree(src, nt);
-                    // Destination-only nodes (no outgoing edges) absorb residual
-                    // into their PPR score but do not push further.
-                    if (total_deg == 0)
-                        continue;
-
-                    // b. Distribute: each neighbor of src (across all edge types
-                    // from nt) receives an equal share of the pushed residual.
-                    double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_deg);
-
-                    for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
-                        // Invariant: fetched and neighbor_cache_ are mutually exclusive for
-                        // any given (node, etype) key within one iteration.  drain_queue()
-                        // only requests a fetch for nodes absent from neighbor_cache_, so a
-                        // key is in at most one of the two.  We check fetched first since it
-                        // is the common case for newly-seen nodes.
-                        //
-                        // `const std::vector<int32_t>*` is a pointer to a neighbor list.
-                        // We use a pointer (rather than copying the list) so we can check
-                        // for absence with nullptr without allocating anything.
-                        const std::vector<int32_t>* nbr_list = nullptr;
-                        auto fi                              = fetched.find(pack_key(src, eid));
-                        if (fi != fetched.end()) {
-                            // `&fi->second` takes the address of the vector stored in
-                            // the map — nbr_list now points to it without copying.
-                            nbr_list = &fi->second;
-                        } else {
-                            auto ci = neighbor_cache_.find(pack_key(src, eid));
-                            if (ci != neighbor_cache_.end())
-                                nbr_list = &ci->second;
-                        }
-                        // Skip if no neighbor list is available (node has no edges of
-                        // this type, or the fetch returned an empty list).
-                        if (!nbr_list || nbr_list->empty())
-                            continue;
-
-                        int32_t dst_nt = edge_type_to_dst_ntype_id_[eid];
-
-                        // c. For each neighbor, accumulate residual and check threshold.
-                        // `*nbr_list` dereferences the pointer to access the vector.
-                        for (int32_t nbr : *nbr_list) {
-                            residuals_[s][dst_nt][nbr] += res_per_nbr;
-
-                            double threshold = requeue_threshold_factor_ *
-                                               static_cast<double>(get_total_degree(nbr, dst_nt));
-
-                            // Only enqueue if: (1) not already in queue for this
-                            // iteration, and (2) residual exceeds the push threshold
-                            // alpha * eps * degree.
-                            if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() &&
-                                residuals_[s][dst_nt][nbr] >= threshold) {
-                                queue_[s][dst_nt].insert(nbr);
-                                ++num_nodes_in_queue_;  // ++x is equivalent to x += 1
-
-                                // Promote this node's neighbor lists to the persistent cache:
-                                // it will be processed next iteration, so caching now avoids
-                                // a re-fetch.  Nodes that are never requeued (typically
-                                // high-degree) are never promoted, keeping their large neighbor
-                                // lists out of the cache.
-                                for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) {
-                                    uint64_t pk = pack_key(nbr, peid);
-                                    if (neighbor_cache_.find(pk) == neighbor_cache_.end()) {
-                                        auto pfi = fetched.find(pk);
-                                        if (pfi != fetched.end())
-                                            neighbor_cache_[pk] = pfi->second;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    // Extract top-k PPR nodes per seed per node type.
-    //
-    // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}.
-    // Only node types that received any PPR score are included in the output.
-    //
-    // Output layout for a batch of B seeds (same structure as _batch_fetch_neighbors):
-    //   flat_ids[0 : valid_counts[0]]                 → top-k nodes for seed 0
-    //   flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1
-    //   ...
-    py::dict extract_top_k(int32_t max_ppr_nodes) {
-        // Collect node types that have any PPR score — skip types with no activity.
-        std::unordered_set<int32_t> active;
-        for (int32_t s = 0; s < batch_size_; ++s)
-            for (int32_t nt = 0; nt < num_node_types_; ++nt)
-                if (!ppr_scores_[s][nt].empty())
-                    active.insert(nt);
-
-        py::dict result;
-        for (int32_t nt : active) {
-            // Flat output vectors — entries for all seeds are concatenated.
-            std::vector<int64_t> flat_ids;
-            std::vector<float> flat_weights;
-            std::vector<int64_t> valid_counts;
-
-            for (int32_t s = 0; s < batch_size_; ++s) {
-                // `const auto&` is a read-only reference — we iterate the map
-                // without copying it.
-                const auto& scores = ppr_scores_[s][nt];
-                // Cap k at the number of nodes that actually have a score.
-                int32_t k = std::min(max_ppr_nodes, static_cast<int32_t>(scores.size()));
-                if (k > 0) {
-                    // Copy the map entries into a vector of (node_id, score) pairs
-                    // so they can be sorted.  std::pair is like a Python 2-tuple.
-                    std::vector<std::pair<int32_t, double>> items(scores.begin(), scores.end());
-
-                    // std::partial_sort rearranges items so that the first k entries
-                    // are the k largest — like Python's heapq.nlargest but in-place.
-                    // The lambda `[](const auto& a, const auto& b) { return ...; }`
-                    // is an anonymous comparator (like Python's `key=` argument).
-                    // `.second` accesses the score (second element of the pair);
-                    // `>` makes it descending (highest score first).
-                    std::partial_sort(
-                        items.begin(), items.begin() + k, items.end(),
-                        [](const auto& a, const auto& b) { return a.second > b.second; });
-
-                    for (int32_t i = 0; i < k; ++i) {
-                        flat_ids.push_back(static_cast<int64_t>(items[i].first));
-                        // Cast to float32 for output; internal scores stay double to
-                        // avoid accumulated rounding errors in the push loop above.
-                        flat_weights.push_back(static_cast<float>(items[i].second));
-                    }
-                }
-                valid_counts.push_back(static_cast<int64_t>(k));
-            }
-
-            // py::make_tuple wraps C++ values into a Python tuple.
-            result[py::int_(nt)] = py::make_tuple(torch::tensor(flat_ids, torch::kLong),
-                                                  torch::tensor(flat_weights, torch::kFloat),
-                                                  torch::tensor(valid_counts, torch::kLong));
-        }
-        return result;
-    }
-
-   private:
-    // Look up the total (across all edge types) out-degree of a node.
-    // Returns 0 for destination-only node types (no outgoing edges).
-    int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const {
-        if (ntype_id >= static_cast<int32_t>(degree_tensors_.size()))
-            return 0;
-        const auto& t = degree_tensors_[ntype_id];
-        if (t.numel() == 0)
-            return 0;  // destination-only type: no outgoing edges
-        TORCH_CHECK(node_id < static_cast<int32_t>(t.size(0)), "Node ID ", node_id,
-                    " out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0),
-                    "). This indicates corrupted graph data or a sampler bug.");
-        // data_ptr<int32_t>() returns a raw C pointer to the tensor's int32 data
-        // buffer.  Direct pointer indexing ([node_id]) is safe here because we
-        // validated the bounds with TORCH_CHECK above.
-        return t.data_ptr<int32_t>()[node_id];
-    }
-
-    // -------------------------------------------------------------------------
-    // Scalar algorithm parameters
-    // -------------------------------------------------------------------------
-    double alpha_;            // Restart probability
-    double one_minus_alpha_;  // 1 - alpha, precomputed to avoid repeated subtraction
-    double
-        requeue_threshold_factor_;  // alpha * eps; multiplied by degree to get per-node threshold
-
-    int32_t batch_size_;             // Number of seeds in the current batch
-    int32_t num_node_types_;         // Total number of node types (homo + hetero)
-    int32_t num_nodes_in_queue_{0};  // Running count of nodes across all seeds / types
-
-    // -------------------------------------------------------------------------
-    // Graph structure (read-only after construction)
-    // -------------------------------------------------------------------------
-    // node_type_to_edge_type_ids_[ntype_id] → list of edge type IDs that can be
-    // traversed from that node type (outgoing or incoming, depending on edge_dir).
-    std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;
-    // edge_type_to_dst_ntype_id_[eid] → node type ID at the destination end.
-    std::vector<int32_t> edge_type_to_dst_ntype_id_;
-    // degree_tensors_[ntype_id][node_id] → total degree of that node across all
-    // edge types traversable from its type.  Empty tensor means no outgoing edges.
-    std::vector<torch::Tensor> degree_tensors_;
-
-    // -------------------------------------------------------------------------
-    // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id])
-    // -------------------------------------------------------------------------
-    // double precision avoids float32 rounding errors accumulating over 20-30
-    // push iterations, which would otherwise cause ~1e-4 score errors vs the
-    // true PPR.  Output weights are cast to float32 in extract_top_k.
-    //
-    // ppr_scores_[s][nt]: node_id → absorbed PPR score (Σ of residuals pushed so far)
-    std::vector<std::vector<std::unordered_map<int32_t, double>>> ppr_scores_;
-    // residuals_[s][nt]: node_id → unabsorbed probability mass waiting to be pushed
-    std::vector<std::vector<std::unordered_map<int32_t, double>>> residuals_;
-    // queue_[s][nt]: nodes whose residual exceeds the threshold and need a push next round
-    std::vector<std::vector<std::unordered_set<int32_t>>> queue_;
-    // queued_nodes_[s][nt]: snapshot of queue_ taken by drain_queue() for the current round.
-    // Separating it from queue_ lets push_residuals() enqueue new nodes into queue_ without
-    // modifying the set currently being iterated.
-    std::vector<std::vector<std::unordered_set<int32_t>>> queued_nodes_;
-
-    // -------------------------------------------------------------------------
-    // Neighbor cache
-    // -------------------------------------------------------------------------
-    // Persistent cache: pack_key(node_id, etype_id) → neighbor list.
-    // Only nodes that have been re-queued (and will therefore be processed again)
-    // are promoted here from the per-iteration fetched map in push_residuals().
-    // This avoids re-fetching neighbors for nodes processed in multiple iterations
-    // while keeping large neighbor lists of high-degree (never-requeued) nodes
-    // out of memory.
-    std::unordered_map<uint64_t, std::vector<int32_t>> neighbor_cache_;
-};
-
-// Register PPRForwardPushState with Python via pybind11.
-//
-// TORCH_EXTENSION_NAME is set by PyTorch's setup() at build time to match the
-// Python module name (e.g. "ppr_forward_push").  At import time, Python calls
-// this function to populate the module with the C++ class.
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    py::class_<PPRForwardPushState>(m, "PPRForwardPushState")
-        // .def(py::init<...>()) exposes the constructor.  The template arguments
-        // list the exact C++ parameter types so pybind11 can convert Python
-        // arguments to the correct C++ types automatically.
-        .def(py::init<torch::Tensor, int32_t, double, double, std::vector<std::vector<int32_t>>,
-                      std::vector<int32_t>, std::vector<torch::Tensor>>())
-        .def("drain_queue", &PPRForwardPushState::drain_queue)
-        .def("push_residuals", &PPRForwardPushState::push_residuals)
-        .def("extract_top_k", &PPRForwardPushState::extract_top_k);
-}
diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index b63329357..6285d67e6 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -14,7 +14,7 @@
 from graphlearn_torch.typing import EdgeType, NodeType
 from graphlearn_torch.utils import merge_dict
 
-from gigl.distributed.cpp_extensions import PPRForwardPushState
+from gigl.csrc.distributed import PPRForwardPushState
 from gigl.distributed.dist_neighbor_sampler import DistNeighborSampler
 from gigl.types.graph import is_label_edge_type
 

From c66a6e53a6e7cda2564394a2ffe61e59cc962125 Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Wed, 25 Mar 2026 22:21:38 +0000
Subject: [PATCH 07/14] Move PPR forward-push extension from gigl/csrc/distributed to gigl/csrc/sampling

---
 gigl/csrc/{distributed => sampling}/__init__.py                 | 2 +-
 gigl/csrc/{distributed => sampling}/ppr_forward_push.cpp        | 0
 gigl/csrc/{distributed => sampling}/ppr_forward_push.h          | 0
 gigl/csrc/{distributed => sampling}/ppr_forward_push.pyi        | 0
 gigl/csrc/{distributed => sampling}/python_ppr_forward_push.cpp | 0
 gigl/distributed/dist_ppr_sampler.py                            | 2 +-
 6 files changed, 2 insertions(+), 2 deletions(-)
 rename gigl/csrc/{distributed => sampling}/__init__.py (74%)
 rename gigl/csrc/{distributed => sampling}/ppr_forward_push.cpp (100%)
 rename gigl/csrc/{distributed => sampling}/ppr_forward_push.h (100%)
 rename gigl/csrc/{distributed => sampling}/ppr_forward_push.pyi (100%)
 rename gigl/csrc/{distributed => sampling}/python_ppr_forward_push.cpp (100%)

diff --git a/gigl/csrc/distributed/__init__.py b/gigl/csrc/sampling/__init__.py
similarity index 74%
rename from gigl/csrc/distributed/__init__.py
rename to gigl/csrc/sampling/__init__.py
index d8ffa921a..b2e23ba6c 100644
--- a/gigl/csrc/distributed/__init__.py
+++ b/gigl/csrc/sampling/__init__.py
@@ -1,5 +1,5 @@
 try:
-    from gigl.csrc.distributed.ppr_forward_push import PPRForwardPushState
+    from gigl.csrc.sampling.ppr_forward_push import PPRForwardPushState
 except ImportError as e:
     raise ImportError(
         "PPR C++ extension not compiled. "
diff --git a/gigl/csrc/distributed/ppr_forward_push.cpp b/gigl/csrc/sampling/ppr_forward_push.cpp
similarity index 100%
rename from gigl/csrc/distributed/ppr_forward_push.cpp
rename to gigl/csrc/sampling/ppr_forward_push.cpp
diff --git a/gigl/csrc/distributed/ppr_forward_push.h b/gigl/csrc/sampling/ppr_forward_push.h
similarity index 100%
rename from gigl/csrc/distributed/ppr_forward_push.h
rename to gigl/csrc/sampling/ppr_forward_push.h
diff --git a/gigl/csrc/distributed/ppr_forward_push.pyi b/gigl/csrc/sampling/ppr_forward_push.pyi
similarity index 100%
rename from gigl/csrc/distributed/ppr_forward_push.pyi
rename to gigl/csrc/sampling/ppr_forward_push.pyi
diff --git a/gigl/csrc/distributed/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp
similarity index 100%
rename from gigl/csrc/distributed/python_ppr_forward_push.cpp
rename to gigl/csrc/sampling/python_ppr_forward_push.cpp
diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index 6285d67e6..9e8c8a482 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -14,7 +14,7 @@
 from graphlearn_torch.typing import EdgeType, NodeType
 from graphlearn_torch.utils import merge_dict
 
-from gigl.csrc.distributed import PPRForwardPushState
+from gigl.csrc.sampling import PPRForwardPushState
 from gigl.distributed.dist_neighbor_sampler import DistNeighborSampler
 from gigl.types.graph import is_label_edge_type
 

From 6e63172a3fe10155358d790710df4e3be964161c Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Wed, 1 Apr 2026 22:16:18 +0000
Subject: [PATCH 08/14] Fetch PPR neighbors concurrently per edge type and add PPR timing metadata keys

---
 gigl/distributed/dist_ppr_sampler.py | 123 +++++++++++++++++++++------
 gigl/distributed/sampler_options.py  |   2 +-
 2 files changed, 96 insertions(+), 29 deletions(-)

diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index 9e8c8a482..23c9d3c80 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -1,6 +1,5 @@
-# TODO (mkolodner-sc): Investigate whether concurrency for _sample_one_hop and _compute_ppr_scores will
-# yield performance benefits.
-
+import asyncio
+import time
 from collections import defaultdict
 from typing import Optional, Union
 
@@ -24,6 +23,9 @@
 # yield a bare EdgeType repr for ast.literal_eval).
 PPR_EDGE_INDEX_METADATA_KEY = "ppr_edge_index."
 PPR_WEIGHT_METADATA_KEY = "ppr_weight."
+PPR_FETCH_TIME_MS_METADATA_KEY = "ppr_fetch_time_ms"
+PPR_PUSH_TIME_MS_METADATA_KEY = "ppr_push_time_ms"
+PPR_ITERATIONS_METADATA_KEY = "ppr_iterations"
 
 # Sentinel type names for homogeneous graphs.  The PPR algorithm uses
 # dict[NodeType, ...] internally for both homo and hetero graphs; these
@@ -292,18 +294,29 @@ async def _batch_fetch_neighbors(
                 5: (tensor([7]),    tensor([0, 3]),        tensor([2])),
             }
         """
-        result: dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = {}
-        for eid, node_ids_tensor in nodes_by_etype_id.items():
-            etype = self._etype_id_to_etype[eid]
-            # _sample_one_hop expects None for homogeneous graphs, not the PPR sentinel.
-            output: NeighborOutput = await self._sample_one_hop(
-                srcs=node_ids_tensor.to(device),
-                num_nbr=self._num_neighbors_per_hop,
-                etype=etype if etype != _PPR_HOMOGENEOUS_EDGE_TYPE else None,
-            )
-            result[eid] = (node_ids_tensor, output.nbr, output.nbr_num)
-
-        return result
+        # Fire all per-edge-type RPC calls concurrently.  Each _sample_one_hop
+        # issues a single RPC round-trip; doing them in parallel rather than
+        # sequentially cuts fetch latency from O(num_edge_types) to O(1).
+        eids = list(nodes_by_etype_id.keys())
+        outputs: list[NeighborOutput] = await asyncio.gather(
+            *[
+                self._sample_one_hop(
+                    srcs=nodes_by_etype_id[eid].to(device),
+                    num_nbr=self._num_neighbors_per_hop,
+                    # _sample_one_hop expects None for homogeneous graphs, not the PPR sentinel.
+                    etype=(
+                        self._etype_id_to_etype[eid]
+                        if self._etype_id_to_etype[eid] != _PPR_HOMOGENEOUS_EDGE_TYPE
+                        else None
+                    ),
+                )
+                for eid in eids
+            ]
+        )
+        return {
+            eid: (nodes_by_etype_id[eid], output.nbr, output.nbr_num)
+            for eid, output in zip(eids, outputs)
+        }
 
     async def _compute_ppr_scores(
         self,
@@ -313,6 +326,7 @@ async def _compute_ppr_scores(
         Union[torch.Tensor, dict[NodeType, torch.Tensor]],
         Union[torch.Tensor, dict[NodeType, torch.Tensor]],
         Union[torch.Tensor, dict[NodeType, torch.Tensor]],
+        tuple[float, float, int],
     ]:
         """
         Compute PPR scores for seed nodes using the push-based approximation algorithm.
@@ -354,6 +368,9 @@ async def _compute_ppr_scores(
               seed, shape ``[batch_size]``.  Used to slice the flat tensors into
               per-seed groups: seed ``i``'s neighbors are at
               ``flat_neighbor_ids[sum(valid_counts[:i]) : sum(valid_counts[:i+1])]``.
+            - timing: ``(fetch_ms, push_ms, iterations)`` — wall-clock time
+              spent in neighbor fetch (ms), residual push (ms), and total loop
+              iteration count for this call.
 
         Example::
 
@@ -376,6 +393,10 @@ async def _compute_ppr_scores(
             self._degree_tensors_for_cpp,
         )
 
+        total_fetch_ms = 0.0
+        total_push_ms = 0.0
+        total_iterations = 0
+
         while True:
             # drain_queue returns None when the queue is truly empty (convergence),
             # or a dict (possibly empty) when nodes were drained.  An empty dict
@@ -388,13 +409,18 @@ async def _compute_ppr_scores(
 
             nodes_by_etype_id: dict[int, torch.Tensor] = drain_result
             if nodes_by_etype_id:
+                fetch_start = time.perf_counter()
                 fetched_by_etype_id = await self._batch_fetch_neighbors(
                     nodes_by_etype_id, device
                 )
+                total_fetch_ms += (time.perf_counter() - fetch_start) * 1000
             else:
                 fetched_by_etype_id = {}
 
+            push_start = time.perf_counter()
             ppr_state.push_residuals(fetched_by_etype_id)
+            total_push_ms += (time.perf_counter() - push_start) * 1000
+            total_iterations += 1
 
         # Translate ntype_id integer keys back to NodeType strings for the rest
         # of the pipeline, and move tensors to the correct device.
@@ -410,6 +436,7 @@ async def _compute_ppr_scores(
             ntype_to_flat_weights[ntype] = flat_weights.to(device)
             ntype_to_valid_counts[ntype] = valid_counts.to(device)
 
+        timing = (total_fetch_ms, total_push_ms, total_iterations)
         if self._is_homogeneous:
             assert (
                 len(ntype_to_flat_ids) == 1
@@ -419,9 +446,15 @@ async def _compute_ppr_scores(
                 ntype_to_flat_ids[_PPR_HOMOGENEOUS_NODE_TYPE],
                 ntype_to_flat_weights[_PPR_HOMOGENEOUS_NODE_TYPE],
                 ntype_to_valid_counts[_PPR_HOMOGENEOUS_NODE_TYPE],
+                timing,
             )
         else:
-            return ntype_to_flat_ids, ntype_to_flat_weights, ntype_to_valid_counts
+            return (
+                ntype_to_flat_ids,
+                ntype_to_flat_weights,
+                ntype_to_valid_counts,
+                timing,
+            )
 
     async def _sample_from_nodes(
         self,
@@ -508,20 +541,42 @@ async def _sample_from_nodes(
             # NodeType -> global IDs (same values as nodes_to_sample).
             src_dict = inducer.init_node(nodes_to_sample)
 
-            # Compute PPR for each seed type, collecting flat global neighbor IDs,
-            # weights, and per-seed counts.  Build nbr_dict for a single
-            # inducer.induce_next call using PPR edge types (seed_type, 'ppr', ntype)
-            # — the inducer only cares about etype[0] and etype[-1] as source/dest
-            # node types, so the relation name is arbitrary.
+            # Compute PPR for all seed types concurrently, collecting flat global
+            # neighbor IDs, weights, and per-seed counts.  Build nbr_dict for a
+            # single inducer.induce_next call using PPR edge types
+            # (seed_type, 'ppr', ntype) — the inducer only cares about etype[0]
+            # and etype[-1] as source/dest node types, so the relation name is
+            # arbitrary.
+            #
+            # Each seed type's PPR computation is entirely independent: it creates
+            # its own PPRForwardPushState and only reads shared sampler attributes
+            # (degree tensors, edge-type maps) which are immutable after __init__.
+            # Running them with asyncio.gather allows their fetch phases to overlap,
+            # which is most beneficial when there are 2+ distinct seed node types
+            # (e.g. cross-type supervision edges like user→story).
+            seed_types = list(nodes_to_sample.keys())
+            ppr_results = await asyncio.gather(
+                *[
+                    self._compute_ppr_scores(nodes_to_sample[seed_type], seed_type)
+                    for seed_type in seed_types
+                ]
+            )
+
             nbr_dict: dict[EdgeType, list[torch.Tensor]] = {}
             ppr_edge_type_to_flat_weights: dict[EdgeType, torch.Tensor] = {}
-
-            for seed_type, seed_nodes in nodes_to_sample.items():
-                (
-                    ntype_to_flat_ids,
-                    ntype_to_flat_weights,
-                    ntype_to_valid_counts,
-                ) = await self._compute_ppr_scores(seed_nodes, seed_type)
+            total_fetch_ms = 0.0
+            total_push_ms = 0.0
+            total_iterations = 0
+
+            for seed_type, (
+                ntype_to_flat_ids,
+                ntype_to_flat_weights,
+                ntype_to_valid_counts,
+                (fetch_ms, push_ms, iterations),
+            ) in zip(seed_types, ppr_results):
+                total_fetch_ms += fetch_ms
+                total_push_ms += push_ms
+                total_iterations += iterations
                 assert isinstance(ntype_to_flat_ids, dict)
                 assert isinstance(ntype_to_flat_weights, dict)
                 assert isinstance(ntype_to_valid_counts, dict)
@@ -583,6 +638,12 @@ async def _sample_from_nodes(
                 metadata[f"{PPR_EDGE_INDEX_METADATA_KEY}{etype_str}"] = edge_index
                 metadata[f"{PPR_WEIGHT_METADATA_KEY}{etype_str}"] = flat_weights
 
+            metadata[PPR_FETCH_TIME_MS_METADATA_KEY] = torch.tensor(total_fetch_ms)
+            metadata[PPR_PUSH_TIME_MS_METADATA_KEY] = torch.tensor(total_push_ms)
+            metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor(
+                total_iterations, dtype=torch.long
+            )
+
             sample_output = HeteroSamplerOutput(
                 node=node_dict,
                 # row/col/edge are left empty rather than populated with PPR edges because
@@ -613,6 +674,7 @@ async def _sample_from_nodes(
                 homo_flat_ids,
                 homo_flat_weights,
                 homo_valid_counts,
+                (total_fetch_ms, total_push_ms, total_iterations),
             ) = await self._compute_ppr_scores(nodes_to_sample, None)
             assert isinstance(homo_flat_ids, torch.Tensor)
             assert isinstance(homo_flat_weights, torch.Tensor)
@@ -634,6 +696,11 @@ async def _sample_from_nodes(
 
             metadata["edge_index"] = ppr_edge_index
             metadata["edge_attr"] = homo_flat_weights
+            metadata[PPR_FETCH_TIME_MS_METADATA_KEY] = torch.tensor(total_fetch_ms)
+            metadata[PPR_PUSH_TIME_MS_METADATA_KEY] = torch.tensor(total_push_ms)
+            metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor(
+                total_iterations, dtype=torch.long
+            )
 
             sample_output = SamplerOutput(
                 node=all_nodes,
diff --git a/gigl/distributed/sampler_options.py b/gigl/distributed/sampler_options.py
index d87a83d52..eccdd70f7 100644
--- a/gigl/distributed/sampler_options.py
+++ b/gigl/distributed/sampler_options.py
@@ -64,7 +64,7 @@ class PPRSamplerOptions:
     alpha: float = 0.5
     eps: float = 1e-4
     max_ppr_nodes: int = 50
-    num_neighbors_per_hop: int = 100_000
+    num_neighbors_per_hop: int = 1_000
     total_degree_dtype: torch.dtype = torch.int32
 
 

From c16dd9d38300d30a542407d659df236644ac2d24 Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Wed, 1 Apr 2026 23:30:23 +0000
Subject: [PATCH 09/14] Fix type check and remove unused etypes from
 num_sampled_edges

---
 gigl/distributed/utils/neighborloader.py | 1 +
 scripts/build_cpp_extensions.py          | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/gigl/distributed/utils/neighborloader.py b/gigl/distributed/utils/neighborloader.py
index 974d12cf6..d0d96638f 100644
--- a/gigl/distributed/utils/neighborloader.py
+++ b/gigl/distributed/utils/neighborloader.py
@@ -189,6 +189,7 @@ def strip_non_ppr_edge_types(
     for edge_type in list(data.edge_types):
         if edge_type not in ppr_edge_types:
             del data[edge_type]
+            data.num_sampled_edges.pop(edge_type, None)
     return data
 
 
diff --git a/scripts/build_cpp_extensions.py b/scripts/build_cpp_extensions.py
index d05740860..f99e47f9b 100644
--- a/scripts/build_cpp_extensions.py
+++ b/scripts/build_cpp_extensions.py
@@ -10,13 +10,13 @@
 
 from pathlib import Path
 
-from setuptools import setup
+from setuptools import Extension, setup
 from torch.utils.cpp_extension import BuildExtension, CppExtension
 
 _CSRC_DIR = Path("gigl/csrc")
 
 
-def find_cpp_extensions() -> list[CppExtension]:
+def find_cpp_extensions() -> list[Extension]:
     """Auto-discover pybind11 extension modules under ``gigl/csrc/``.
 
     Following PyTorch's csrc convention, only files named ``python_*.cpp`` are

From d651f41f6f05028c5294ffdbb9d0d548056d3095 Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Mon, 6 Apr 2026 23:29:52 +0000
Subject: [PATCH 10/14] fetched-count normalization, nodes_drained diagnostic,
 max_fetch_iterations, per-iteration timing metadata

---
 gigl/csrc/sampling/ppr_forward_push.cpp       |  38 ++++-
 gigl/csrc/sampling/ppr_forward_push.h         |  11 ++
 gigl/csrc/sampling/ppr_forward_push.pyi       |   1 +
 .../csrc/sampling/python_ppr_forward_push.cpp |   3 +-
 gigl/distributed/dist_ppr_sampler.py          | 134 ++++++++++++++++--
 gigl/distributed/dist_sampling_producer.py    |   1 +
 gigl/distributed/sampler_options.py           |   7 +
 7 files changed, 175 insertions(+), 20 deletions(-)

diff --git a/gigl/csrc/sampling/ppr_forward_push.cpp b/gigl/csrc/sampling/ppr_forward_push.cpp
index a514907ab..1bbf05dbe 100644
--- a/gigl/csrc/sampling/ppr_forward_push.cpp
+++ b/gigl/csrc/sampling/ppr_forward_push.cpp
@@ -57,6 +57,7 @@ std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::d
     // in multiple seeds' queues: we only fetch each (node, etype) pair once.
     std::unordered_map<int32_t, std::unordered_set<int32_t>> nodes_to_lookup;
 
+    int32_t total_drained_this_round = 0;
     for (int32_t s = 0; s < batch_size_; ++s) {
         for (int32_t nt = 0; nt < num_node_types_; ++nt) {
             if (queue_[s][nt].empty())
@@ -65,6 +66,7 @@ std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::d
             // Move the live queue into the snapshot (no data copy — O(1)).
             queued_nodes_[s][nt] = std::move(queue_[s][nt]);
             queue_[s][nt].clear();
+            total_drained_this_round += static_cast<int32_t>(queued_nodes_[s][nt].size());
             num_nodes_in_queue_ -= static_cast<int32_t>(queued_nodes_[s][nt].size());
 
             for (int32_t node_id : queued_nodes_[s][nt]) {
@@ -77,6 +79,8 @@ std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::d
         }
     }
 
+    nodes_drained_per_iteration_.push_back(total_drained_this_round);
+
     std::unordered_map<int32_t, torch::Tensor> result;
     for (auto& [eid, node_set] : nodes_to_lookup) {
         std::vector<int64_t> ids(node_set.begin(), node_set.end());
@@ -85,6 +89,10 @@ std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::d
     return result;
 }
 
+const std::vector<int32_t>& PPRForwardPushState::get_nodes_drained_per_iteration() const {
+    return nodes_drained_per_iteration_;
+}
+
 void PPRForwardPushState::push_residuals(
     const std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>&
         fetched_by_etype_id) {
@@ -135,13 +143,28 @@ void PPRForwardPushState::push_residuals(
                 ppr_scores_[s][nt][src] += res;
                 src_res[src] = 0.0;
 
-                int32_t total_deg = get_total_degree(src, nt);
-                // Destination-only nodes absorb residual but do not push further.
-                if (total_deg == 0)
+                // b. Count total fetched/cached neighbors across all edge types for
+                // this source node.  We normalise by the number of neighbors we
+                // actually retrieved, not the true degree, so residual is fully
+                // distributed among known neighbors rather than leaking to unfetched
+                // ones (which matters when num_neighbors_per_hop < true_degree).
+                int32_t total_fetched = 0;
+                for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
+                    auto fi = fetched.find(pack_key(src, eid));
+                    if (fi != fetched.end()) {
+                        total_fetched += static_cast<int32_t>(fi->second.size());
+                    } else {
+                        auto ci = neighbor_cache_.find(pack_key(src, eid));
+                        if (ci != neighbor_cache_.end())
+                            total_fetched += static_cast<int32_t>(ci->second.size());
+                    }
+                }
+                // Destination-only nodes (or nodes with no fetched neighbors) absorb
+                // residual but do not push further.
+                if (total_fetched == 0)
                     continue;
 
-                // b. Distribute: each neighbor receives an equal share.
-                double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_deg);
+                double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_fetched);
 
                 for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
                     // Invariant: fetched and neighbor_cache_ are mutually exclusive for
@@ -167,8 +190,9 @@ void PPRForwardPushState::push_residuals(
                     for (int32_t nbr : *nbr_list) {
                         residuals_[s][dst_nt][nbr] += res_per_nbr;
 
-                        double threshold = requeue_threshold_factor_ *
-                                           static_cast<double>(get_total_degree(nbr, dst_nt));
+                        double threshold =
+                            requeue_threshold_factor_ *
+                            static_cast<double>(get_total_degree(nbr, dst_nt));
 
                         if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() &&
                             residuals_[s][dst_nt][nbr] >= threshold) {
diff --git a/gigl/csrc/sampling/ppr_forward_push.h b/gigl/csrc/sampling/ppr_forward_push.h
index 7f0c92f49..82973ff7a 100644
--- a/gigl/csrc/sampling/ppr_forward_push.h
+++ b/gigl/csrc/sampling/ppr_forward_push.h
@@ -118,4 +118,15 @@ class PPRForwardPushState {
     // Neighbor cache
     // -------------------------------------------------------------------------
     std::unordered_map<uint64_t, std::vector<int32_t>> neighbor_cache_;
+
+    // -------------------------------------------------------------------------
+    // Diagnostics (populated during the algorithm; read after convergence)
+    // -------------------------------------------------------------------------
+    // Total nodes drained (across all seeds and node types) in each drain_queue()
+    // call.  One entry per loop iteration; useful for understanding convergence speed.
+    std::vector<int32_t> nodes_drained_per_iteration_;
+
+   public:
+    // Returns nodes_drained_per_iteration_ built up across all drain_queue() calls.
+    const std::vector<int32_t>& get_nodes_drained_per_iteration() const;
 };
diff --git a/gigl/csrc/sampling/ppr_forward_push.pyi b/gigl/csrc/sampling/ppr_forward_push.pyi
index 265468c3c..9a3c78fea 100644
--- a/gigl/csrc/sampling/ppr_forward_push.pyi
+++ b/gigl/csrc/sampling/ppr_forward_push.pyi
@@ -19,3 +19,4 @@ class PPRForwardPushState:
     def extract_top_k(
         self, max_ppr_nodes: int
     ) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: ...
+    def get_nodes_drained_per_iteration(self) -> list[int]: ...
diff --git a/gigl/csrc/sampling/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp
index ebf3fa27a..4a296abf8 100644
--- a/gigl/csrc/sampling/python_ppr_forward_push.cpp
+++ b/gigl/csrc/sampling/python_ppr_forward_push.cpp
@@ -59,5 +59,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
                       std::vector<torch::Tensor>>())
         .def("drain_queue", drain_queue_wrapper)
         .def("push_residuals", push_residuals_wrapper)
-        .def("extract_top_k", extract_top_k_wrapper);
+        .def("extract_top_k", extract_top_k_wrapper)
+        .def("get_nodes_drained_per_iteration", &PPRForwardPushState::get_nodes_drained_per_iteration);
 }
diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index 891b70772..b27786a31 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -13,10 +13,13 @@
 from graphlearn_torch.typing import EdgeType, NodeType
 from graphlearn_torch.utils import merge_dict
 
+from gigl.common.logger import Logger
 from gigl.csrc.sampling import PPRForwardPushState
 from gigl.distributed.base_sampler import BaseDistNeighborSampler
 from gigl.types.graph import is_label_edge_type
 
+_logger = Logger()
+
 # Trailing "." is an intentional separator.  These constants are used both to
 # write metadata keys (f"{KEY}{repr(edge_type)}" → e.g. "ppr_edge_index.('user', 'to', 'story')")
 # and as the strip prefix in extract_edge_type_metadata (key[len(prefix):] must
@@ -26,6 +29,10 @@
 PPR_FETCH_TIME_MS_METADATA_KEY = "ppr_fetch_time_ms"
 PPR_PUSH_TIME_MS_METADATA_KEY = "ppr_push_time_ms"
 PPR_ITERATIONS_METADATA_KEY = "ppr_iterations"
+PPR_NODES_PER_ITERATION_METADATA_KEY = "ppr_nodes_per_iteration"
+PPR_FETCH_TIME_PER_ITER_MS_METADATA_KEY = "ppr_fetch_time_per_iter_ms"
+PPR_PUSH_TIME_PER_ITER_MS_METADATA_KEY = "ppr_push_time_per_iter_ms"
+PPR_NODES_NEEDING_FETCH_PER_ITER_METADATA_KEY = "ppr_nodes_needing_fetch_per_iter"
 
 # Sentinel type names for homogeneous graphs.  The PPR algorithm uses
 # dict[NodeType, ...] internally for both homo and hetero graphs; these
@@ -80,9 +87,10 @@ class DistPPRNeighborSampler(BaseDistNeighborSampler):
              but require more computation. Typical values: 1e-4 to 1e-6.
         max_ppr_nodes: Maximum number of nodes to return per seed based on PPR scores.
         num_neighbors_per_hop: Maximum number of neighbors to fetch per hop.
-        total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults to
-            ``torch.int32``, which supports total degrees up to ~2 billion. Use a
-            larger dtype if nodes have exceptionally high aggregate degrees.
+        total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults
+            to ``torch.int32``. Use a larger dtype if nodes have exceptionally high
+            aggregate degrees.
+        degree_tensors: Pre-computed degree tensors from the dataset.
     """
 
     def __init__(
@@ -94,6 +102,7 @@ def __init__(
         num_neighbors_per_hop: int = 100_000,
         total_degree_dtype: torch.dtype = torch.int32,
         degree_tensors: Union[torch.Tensor, dict[EdgeType, torch.Tensor]],
+        max_fetch_iterations: int = 0,
         **kwargs,
     ):
         super().__init__(*args, **kwargs)
@@ -102,6 +111,7 @@ def __init__(
         self._max_ppr_nodes = max_ppr_nodes
         self._requeue_threshold_factor = alpha * eps
         self._num_neighbors_per_hop = num_neighbors_per_hop
+        self._max_fetch_iterations = max_fetch_iterations
 
         # Build mapping from node type to edge types that can be traversed from that node type.
         self._node_type_to_edge_types: dict[NodeType, list[EdgeType]] = defaultdict(
@@ -321,7 +331,7 @@ async def _compute_ppr_scores(
         Union[torch.Tensor, dict[NodeType, torch.Tensor]],
         Union[torch.Tensor, dict[NodeType, torch.Tensor]],
         Union[torch.Tensor, dict[NodeType, torch.Tensor]],
-        tuple[float, float, int],
+        tuple[float, float, int, list[int], list[float], list[float], list[int]],
     ]:
         """
         Compute PPR scores for seed nodes using the push-based approximation algorithm.
@@ -391,6 +401,10 @@ async def _compute_ppr_scores(
         total_fetch_ms = 0.0
         total_push_ms = 0.0
         total_iterations = 0
+        fetch_iteration_count = 0
+        fetch_ms_per_iter: list[float] = []
+        push_ms_per_iter: list[float] = []
+        nodes_needing_fetch_per_iter: list[int] = []
 
         while True:
             # drain_queue returns None when the queue is truly empty (convergence),
@@ -398,23 +412,41 @@ async def _compute_ppr_scores(
             # means all drained nodes either had cached neighbors or no outgoing
             # edges — we still call push_residuals to flush their residuals into
             # ppr_scores_.
-            drain_result: dict[int, torch.Tensor] | None = ppr_state.drain_queue()
+            drain_result: Optional[dict[int, torch.Tensor]] = ppr_state.drain_queue()
             if drain_result is None:
                 break
 
             nodes_by_etype_id: dict[int, torch.Tensor] = drain_result
-            if nodes_by_etype_id:
+            fetch_budget_remaining = (
+                self._max_fetch_iterations == 0
+                or fetch_iteration_count < self._max_fetch_iterations
+            )
+            if nodes_by_etype_id and fetch_budget_remaining:
+                # Total (node, edge_type) pairs needing RPCs this iteration.
+                # A node with uncached neighbors for k edge types contributes k here.
+                nodes_needing_fetch = sum(t.numel() for t in nodes_by_etype_id.values())
                 fetch_start = time.perf_counter()
                 fetched_by_etype_id = await self._batch_fetch_neighbors(
                     nodes_by_etype_id, device
                 )
-                total_fetch_ms += (time.perf_counter() - fetch_start) * 1000
+                iter_fetch_ms = (time.perf_counter() - fetch_start) * 1000
+                total_fetch_ms += iter_fetch_ms
+                fetch_iteration_count += 1
             else:
+                # Either all nodes are cached, or the fetch budget is exhausted.
+                # push_residuals will propagate using the existing neighbor cache.
+                nodes_needing_fetch = 0
                 fetched_by_etype_id = {}
+                iter_fetch_ms = 0.0
+
+            nodes_needing_fetch_per_iter.append(nodes_needing_fetch)
+            fetch_ms_per_iter.append(iter_fetch_ms)
 
             push_start = time.perf_counter()
             ppr_state.push_residuals(fetched_by_etype_id)
-            total_push_ms += (time.perf_counter() - push_start) * 1000
+            iter_push_ms = (time.perf_counter() - push_start) * 1000
+            total_push_ms += iter_push_ms
+            push_ms_per_iter.append(iter_push_ms)
             total_iterations += 1
 
         # Translate ntype_id integer keys back to NodeType strings for the rest
@@ -431,7 +463,18 @@ async def _compute_ppr_scores(
             ntype_to_flat_weights[ntype] = flat_weights.to(device)
             ntype_to_valid_counts[ntype] = valid_counts.to(device)
 
-        timing = (total_fetch_ms, total_push_ms, total_iterations)
+        nodes_drained_per_iteration: list[
+            int
+        ] = ppr_state.get_nodes_drained_per_iteration()
+        timing = (
+            total_fetch_ms,
+            total_push_ms,
+            total_iterations,
+            nodes_drained_per_iteration,
+            fetch_ms_per_iter,
+            push_ms_per_iter,
+            nodes_needing_fetch_per_iter,
+        )
         if self._is_homogeneous:
             assert (
                 len(ntype_to_flat_ids) == 1
@@ -562,16 +605,48 @@ async def _sample_from_nodes(
             total_fetch_ms = 0.0
             total_push_ms = 0.0
             total_iterations = 0
+            total_nodes_per_iteration: list[int] = []
+            total_fetch_ms_per_iter: list[float] = []
+            total_push_ms_per_iter: list[float] = []
+            total_nodes_needing_fetch_per_iter: list[int] = []
 
             for seed_type, (
                 ntype_to_flat_ids,
                 ntype_to_flat_weights,
                 ntype_to_valid_counts,
-                (fetch_ms, push_ms, iterations),
+                (
+                    fetch_ms,
+                    push_ms,
+                    iterations,
+                    nodes_per_iter,
+                    fetch_ms_per_iter,
+                    push_ms_per_iter,
+                    nodes_needing_fetch_per_iter,
+                ),
             ) in zip(seed_types, ppr_results):
                 total_fetch_ms += fetch_ms
                 total_push_ms += push_ms
                 total_iterations += iterations
+                for i, count in enumerate(nodes_per_iter):
+                    if i < len(total_nodes_per_iteration):
+                        total_nodes_per_iteration[i] += count
+                    else:
+                        total_nodes_per_iteration.append(count)
+                for i, val in enumerate(fetch_ms_per_iter):
+                    if i < len(total_fetch_ms_per_iter):
+                        total_fetch_ms_per_iter[i] += val
+                    else:
+                        total_fetch_ms_per_iter.append(val)
+                for i, val in enumerate(push_ms_per_iter):
+                    if i < len(total_push_ms_per_iter):
+                        total_push_ms_per_iter[i] += val
+                    else:
+                        total_push_ms_per_iter.append(val)
+                for i, val in enumerate(nodes_needing_fetch_per_iter):
+                    if i < len(total_nodes_needing_fetch_per_iter):
+                        total_nodes_needing_fetch_per_iter[i] += val
+                    else:
+                        total_nodes_needing_fetch_per_iter.append(val)
                 assert isinstance(ntype_to_flat_ids, dict)
                 assert isinstance(ntype_to_flat_weights, dict)
                 assert isinstance(ntype_to_valid_counts, dict)
@@ -621,7 +696,10 @@ async def _sample_from_nodes(
             # rows_dict and cols_dict are keyed by PPR edge type and give
             # flat local source/destination indices respectively, aligned with
             # the flat_ids order passed to induce_next.
-            for ppr_edge_type, flat_weights in ppr_edge_type_to_flat_weights.items():
+            for (
+                ppr_edge_type,
+                flat_weights,
+            ) in ppr_edge_type_to_flat_weights.items():
                 rows = rows_dict.get(ppr_edge_type)
                 cols = cols_dict.get(ppr_edge_type)
                 if rows is not None and cols is not None:
@@ -638,6 +716,18 @@ async def _sample_from_nodes(
             metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor(
                 total_iterations, dtype=torch.long
             )
+            metadata[PPR_NODES_PER_ITERATION_METADATA_KEY] = torch.tensor(
+                total_nodes_per_iteration, dtype=torch.long
+            )
+            metadata[PPR_FETCH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor(
+                total_fetch_ms_per_iter, dtype=torch.float
+            )
+            metadata[PPR_PUSH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor(
+                total_push_ms_per_iter, dtype=torch.float
+            )
+            metadata[PPR_NODES_NEEDING_FETCH_PER_ITER_METADATA_KEY] = torch.tensor(
+                total_nodes_needing_fetch_per_iter, dtype=torch.long
+            )
 
             sample_output = HeteroSamplerOutput(
                 node=node_dict,
@@ -669,7 +759,15 @@ async def _sample_from_nodes(
                 homo_flat_ids,
                 homo_flat_weights,
                 homo_valid_counts,
-                (total_fetch_ms, total_push_ms, total_iterations),
+                (
+                    total_fetch_ms,
+                    total_push_ms,
+                    total_iterations,
+                    total_nodes_per_iteration,
+                    total_fetch_ms_per_iter,
+                    total_push_ms_per_iter,
+                    total_nodes_needing_fetch_per_iter,
+                ),
             ) = await self._compute_ppr_scores(nodes_to_sample, None)
             assert isinstance(homo_flat_ids, torch.Tensor)
             assert isinstance(homo_flat_weights, torch.Tensor)
@@ -696,6 +794,18 @@ async def _sample_from_nodes(
             metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor(
                 total_iterations, dtype=torch.long
             )
+            metadata[PPR_NODES_PER_ITERATION_METADATA_KEY] = torch.tensor(
+                total_nodes_per_iteration, dtype=torch.long
+            )
+            metadata[PPR_FETCH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor(
+                total_fetch_ms_per_iter, dtype=torch.float
+            )
+            metadata[PPR_PUSH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor(
+                total_push_ms_per_iter, dtype=torch.float
+            )
+            metadata[PPR_NODES_NEEDING_FETCH_PER_ITER_METADATA_KEY] = torch.tensor(
+                total_nodes_needing_fetch_per_iter, dtype=torch.long
+            )
 
             sample_output = SamplerOutput(
                 node=all_nodes,
diff --git a/gigl/distributed/dist_sampling_producer.py b/gigl/distributed/dist_sampling_producer.py
index f155bd929..490514c6c 100644
--- a/gigl/distributed/dist_sampling_producer.py
+++ b/gigl/distributed/dist_sampling_producer.py
@@ -131,6 +131,7 @@ def _sampling_worker_loop(
                 num_neighbors_per_hop=sampler_options.num_neighbors_per_hop,
                 total_degree_dtype=sampler_options.total_degree_dtype,
                 degree_tensors=degree_tensors,
+                max_fetch_iterations=sampler_options.max_fetch_iterations,
             )
         else:
             raise NotImplementedError(
diff --git a/gigl/distributed/sampler_options.py b/gigl/distributed/sampler_options.py
index 639a932a5..d72c9092e 100644
--- a/gigl/distributed/sampler_options.py
+++ b/gigl/distributed/sampler_options.py
@@ -59,6 +59,12 @@ class PPRSamplerOptions:
         total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults
             to ``torch.int32``, which supports total degrees up to ~2 billion.
             Use a larger dtype if nodes have exceptionally high aggregate degrees.
+        max_fetch_iterations: Maximum number of iterations that issue RPC neighbor
+            fetches. After this many fetch iterations, subsequent iterations push
+            residuals using only already-cached neighbor lists (no new RPCs).
+            The algorithm still runs to convergence — re-enqueued nodes propagate
+            through cached neighbors at negligible cost. Set to 0 (default) for
+            no fetch limit.
     """
 
     alpha: float = 0.5
@@ -66,6 +72,7 @@ class PPRSamplerOptions:
     max_ppr_nodes: int = 50
     num_neighbors_per_hop: int = 1_000
     total_degree_dtype: torch.dtype = torch.int32
+    max_fetch_iterations: int = 0
 
 
 SamplerOptions = Union[KHopNeighborSamplerOptions, PPRSamplerOptions]

From 35e52bcda2527b34bb7ffb885e13585a6193ec8d Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Fri, 10 Apr 2026 18:00:52 +0000
Subject: [PATCH 11/14] Add event loop threading

---
 gigl/csrc/sampling/python_ppr_forward_push.cpp | 9 ++++++++-
 gigl/distributed/dist_ppr_sampler.py           | 4 +++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/gigl/csrc/sampling/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp
index 4a296abf8..78de24cbc 100644
--- a/gigl/csrc/sampling/python_ppr_forward_push.cpp
+++ b/gigl/csrc/sampling/python_ppr_forward_push.cpp
@@ -29,13 +29,20 @@ static py::object drain_queue_wrapper(PPRForwardPushState& self) {
 // Convert to C++ map before delegating.
 static void push_residuals_wrapper(PPRForwardPushState& self, py::dict fetched_by_etype_id) {
     std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> cpp_map;
+    // Dict iteration touches Python objects — GIL must be held here.
     for (auto item : fetched_by_etype_id) {
         int32_t eid = item.first.cast<int32_t>();
         auto tup = item.second.cast<py::tuple>();
         cpp_map[eid] = {tup[0].cast<torch::Tensor>(), tup[1].cast<torch::Tensor>(),
                         tup[2].cast<torch::Tensor>()};
     }
-    self.push_residuals(cpp_map);
+    // C++ push only uses tensor accessor/data_ptr APIs — GIL-safe to release.
+    // Releasing here lets the asyncio event loop process RPC completion callbacks
+    // from other concurrent PPR coroutines while this push runs.
+    {
+        py::gil_scoped_release release;
+        self.push_residuals(cpp_map);
+    }
 }
 
 // extract_top_k: C++ returns map<ntype_id, tuple<Tensor, Tensor, Tensor>>.
diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index b27786a31..dfb9220d2 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -443,7 +443,9 @@ async def _compute_ppr_scores(
             fetch_ms_per_iter.append(iter_fetch_ms)
 
             push_start = time.perf_counter()
-            ppr_state.push_residuals(fetched_by_etype_id)
+            await asyncio.get_running_loop().run_in_executor(
+                None, ppr_state.push_residuals, fetched_by_etype_id
+            )
             iter_push_ms = (time.perf_counter() - push_start) * 1000
             total_push_ms += iter_push_ms
             push_ms_per_iter.append(iter_push_ms)

From 4a3beac2b24b062842eb520900c8592605a5402b Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Fri, 10 Apr 2026 21:01:20 +0000
Subject: [PATCH 12/14] Update guidance

---
 .clang-tidy             |  8 +++++++-
 docs/cpp_style_guide.md | 17 +++++++----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.clang-tidy b/.clang-tidy
index 279b7f030..3198f4627 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -40,6 +40,12 @@ WarningsAsErrors: '*'
 HeaderFilterRegex: '.*'
 FormatStyle:     none
 User:            jenkins
+# CheckOptions: per-check tuning parameters. Each entry configures a specific
+# option for an individual check, using the form:
+#   key:   <check-name>.<OptionName>
+#   value: <value>
+# These let you adjust thresholds, naming patterns, and behavior without
+# enabling or disabling the check entirely.
 CheckOptions:
   - key:             bugprone-argument-comment.StrictMode
     value:           '0'
@@ -142,7 +148,7 @@ CheckOptions:
   - key:             performance-type-promotion-in-math-fn.IncludeStyle
     value:           llvm
   - key:             readability-braces-around-statements.ShortStatementLines
-    value:           '2'
+    value:           '0'
   - key:             readability-function-size.BranchThreshold
     value:           '4294967295'
   - key:             readability-function-size.LineThreshold
diff --git a/docs/cpp_style_guide.md b/docs/cpp_style_guide.md
index 4aa0c30c1..8fc84296e 100644
--- a/docs/cpp_style_guide.md
+++ b/docs/cpp_style_guide.md
@@ -5,16 +5,13 @@ GiGL enforces C++ style automatically via two tools:
 - **clang-format** (`.clang-format`) — code formatting
 - **clang-tidy** (`.clang-tidy`) — static analysis and lint
 
-Both run as part of CI. All clang-tidy warnings are treated as errors.
+All clang-tidy warnings are treated as errors.
 
 ## Running the Tools
 
 ```bash
-# Format all C++ files in-place
-clang-format -i $(find gigl/csrc -name '*.cpp' -o -name '*.h')
-
-# Run static analysis
-clang-tidy gigl/csrc/**/*.cpp
+make format_cpp  # Format all C++ files in-place
+make lint_cpp    # Run clang-tidy static analysis
 ```
 
 ______________________________________________________________________
@@ -99,11 +96,11 @@ Enabled families:
 | -------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
 | `boost-use-to-string`            | Prefer `std::to_string` over `boost::lexical_cast` for numeric conversions                                                             |
 | `bugprone-*`                     | Common programming mistakes: dangling handles, suspicious string construction, assert side effects, etc.                               |
-| `cert-*` (selected)              | CERT secure coding rules for error handling (`err34-c`), floating-point loops (`flp30-c`), and RNG seeding (`msc32-c`, `msc50/51-cpp`) |
+| `cert-*`                         | CERT secure coding rules for error handling (`err34-c`), floating-point loops (`flp30-c`), and RNG seeding (`msc32-c`, `msc50/51-cpp`) |
 | `clang-analyzer-*`               | Clang static analyzer: memory safety, null dereferences, use-after-free, etc.                                                          |
 | `clang-diagnostic-*`             | Compiler diagnostic warnings surfaced as lint checks                                                                                   |
-| `cppcoreguidelines-*` (selected) | C++ Core Guidelines: no raw `malloc`, no union member access, no object slicing, safe downcasts                                        |
-| `google-*` (selected)            | Google C++ style: explicit constructors, no global names in headers, safe `memset` usage                                               |
+| `cppcoreguidelines-*`            | C++ Core Guidelines: no raw `malloc`, no union member access, no object slicing, safe downcasts                                        |
+| `google-*`                       | Google C++ style: explicit constructors, no global names in headers, safe `memset` usage                                               |
 | `hicpp-exception-baseclass`      | All thrown exceptions must derive from `std::exception`                                                                                |
 | `misc-*`                         | Miscellaneous: header-only definitions, suspicious enum usage, throw-by-value/catch-by-reference, etc.                                 |
 | `modernize-*`                    | Modernize to C++11/14/17: `nullptr`, range-based for, `make_unique`, `using` aliases, etc.                                             |
@@ -150,4 +147,4 @@ Enforced via `readability-identifier-naming`:
 | `bugprone-string-constructor.LargeLengthThreshold`         | `8388608` (8 MB) | Strings larger than 8 MB from a length argument are flagged    |
 | `modernize-loop-convert.NamingStyle`                       | `CamelCase`      | Auto-generated loop variable names use CamelCase               |
 | `readability-function-size.LineThreshold`                  | `1000`           | Functions over 1000 lines are flagged                          |
-| `readability-braces-around-statements.ShortStatementLines` | `2`              | Single-line bodies up to 2 lines may omit braces               |
+| `readability-braces-around-statements.ShortStatementLines` | `0`              | Braces required for all control-flow bodies, even single-line  |

From 1d1dbfd290c2106044aa2035c9dceb744f586ba9 Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Fri, 10 Apr 2026 21:38:11 +0000
Subject: [PATCH 13/14] Update

---
 Makefile                             |  4 +--
 gigl/scripts/post_install.py         | 47 +++++++---------------------
 requirements/install_cpp_deps.sh     | 12 ++++---
 scripts/build_cpp_extensions.py      | 10 +++---
 scripts/cpp_build_constants.py       | 17 ++++++++++
 scripts/generate_compile_commands.py | 38 ++++++++--------------
 6 files changed, 55 insertions(+), 73 deletions(-)
 create mode 100644 scripts/cpp_build_constants.py

diff --git a/Makefile b/Makefile
index 28bbd3d18..79e2a02ba 100644
--- a/Makefile
+++ b/Makefile
@@ -77,7 +77,7 @@ assert_yaml_configs_parse:
 # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"`
 # By default, runs all tests under tests/unit.
 # See the help text for "--test_file_pattern" in tests/test_args.py for more details.
-unit_test_py: clean_build_files_py type_check build_cpp_extensions
+unit_test_py: clean_build_files_py build_cpp_extensions type_check
 	uv run python -m tests.unit.main \
 		--env=test \
 		--resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \
@@ -119,7 +119,7 @@ check_format_md:
 check_format_cpp:
 	$(if $(CPP_SOURCES), clang-format --dry-run --Werror --style=file $(CPP_SOURCES))
 
-check_format: check_format_py check_format_scala check_format_md check_format_cpp
+check_format: check_format_py check_format_cpp check_format_scala check_format_md
 
 # Set PY_TEST_FILES=<TEST_FILE_NAME_GLOB> to test a specifc file.
 # Ex. `make integration_test PY_TEST_FILES="dataflow_test.py"`
diff --git a/gigl/scripts/post_install.py b/gigl/scripts/post_install.py
index d31b4244a..bafc4ae21 100644
--- a/gigl/scripts/post_install.py
+++ b/gigl/scripts/post_install.py
@@ -11,28 +11,6 @@
 import subprocess
 import sys
 from pathlib import Path
-from typing import Optional
-
-
-def run_command_and_stream_stdout(cmd: str) -> Optional[int]:
-    """
-    Executes a command and streams the stdout output.
-
-    Args:
-        cmd (str): The command to be executed.
-
-    Returns:
-        Optional[int]: The return code of the command, or None if the command failed to execute.
-    """
-    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
-    while True:
-        output = process.stdout.readline()  # type: ignore
-        if output == b"" and process.poll() is not None:
-            break
-        if output:
-            print(output.strip())
-    return_code: Optional[int] = process.poll()
-    return return_code
 
 
 def main():
@@ -49,26 +27,26 @@ def main():
         sys.exit(1)
 
     try:
-        print(f"Executing bash {install_glt_script}...")
-        result = run_command_and_stream_stdout(f"bash {install_glt_script}")
-        print("GLT install finished with return code:", result)
-    except Exception as e:
-        print(f"Unexpected error: {e}")
+        print(f"Installing GLT via {install_glt_script}...")
+        subprocess.run(["bash", str(install_glt_script)], check=True)
+        print("GLT install finished.")
+    except subprocess.CalledProcessError as e:
+        print(f"Error installing GLT: {e}")
         sys.exit(1)
 
     # Step 2: Build pybind11 C++ extensions in-place so they are importable
     # without requiring a separate `make build_cpp_extensions` call.
     # subprocess.run streams stdout/stderr to the terminal and raises
     # CalledProcessError on a non-zero exit code.
+    build_cpp_script = repo_root / "scripts" / "build_cpp_extensions.py"
+    if not build_cpp_script.exists():
+        print(f"Error: build_cpp_extensions.py not found at {build_cpp_script}")
+        sys.exit(1)
+
     try:
         print("Building C++ extensions...")
         subprocess.run(
-            [
-                sys.executable,
-                "scripts/build_cpp_extensions.py",
-                "build_ext",
-                "--inplace",
-            ],
+            [sys.executable, str(build_cpp_script), "build_ext", "--inplace"],
             cwd=repo_root,
             check=True,
         )
@@ -76,9 +54,6 @@ def main():
     except subprocess.CalledProcessError as e:
         print(f"Error building C++ extensions: {e}")
         sys.exit(1)
-    except Exception as e:
-        print(f"Unexpected error building C++ extensions: {e}")
-        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/requirements/install_cpp_deps.sh b/requirements/install_cpp_deps.sh
index 6b9f19113..1e1a6cf16 100644
--- a/requirements/install_cpp_deps.sh
+++ b/requirements/install_cpp_deps.sh
@@ -6,10 +6,15 @@
 #
 # Called by `make install_dev_deps` alongside install_py_deps.sh and
 # install_scala_deps.sh.
+#
+# NOTE: On Linux, this script calls apt-get and update-alternatives, which
+# require root privileges. Run as root or prefix with sudo.
 
 set -e
 set -x
 
+CLANG_VERSION=15  # clang 15 is the highest version available on Ubuntu 22.04.
+
 is_running_on_mac() {
     [ "$(uname)" == "Darwin" ]
     return $?
@@ -35,12 +40,11 @@ if is_running_on_mac; then
     # install_dev_deps to pick up the PATH change.
     export PATH="$LLVM_BIN:$PATH"
 else
-    # Ubuntu / Debian — clang 15 is the highest version available on Ubuntu 22.04.
-    apt-get install -y clang-format-15 clang-tidy-15 cmake
+    apt-get install -y "clang-format-${CLANG_VERSION}" "clang-tidy-${CLANG_VERSION}" cmake
     # Register versioned binaries as the default so bare `clang-format` and
     # `clang-tidy` resolve to them without callers specifying the version suffix.
-    update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-15 100
-    update-alternatives --install /usr/bin/clang-tidy   clang-tidy   /usr/bin/clang-tidy-15   100
+    update-alternatives --install /usr/bin/clang-format clang-format "/usr/bin/clang-format-${CLANG_VERSION}" 100
+    update-alternatives --install /usr/bin/clang-tidy   clang-tidy   "/usr/bin/clang-tidy-${CLANG_VERSION}"   100
 fi
 
 echo "Finished installing C++ tooling"
diff --git a/scripts/build_cpp_extensions.py b/scripts/build_cpp_extensions.py
index 5bc3dd8bf..14a6d3911 100644
--- a/scripts/build_cpp_extensions.py
+++ b/scripts/build_cpp_extensions.py
@@ -8,13 +8,11 @@
     python build_cpp_extensions.py build_ext --inplace
 """
 
-from pathlib import Path
-
 from setuptools import setup
 from setuptools.extension import Extension
 from torch.utils.cpp_extension import BuildExtension, CppExtension
 
-_CSRC_DIR = Path("gigl/csrc")
+from cpp_build_constants import COMPILE_ARGS, CSRC_DIR, REPO_ROOT
 
 
 def find_cpp_extensions() -> list[Extension]:
@@ -25,10 +23,12 @@ def find_cpp_extensions() -> list[Extension]:
 
     Returns an empty list if ``gigl/csrc/`` does not yet exist.
     """
-    if not _CSRC_DIR.exists():
+    if not CSRC_DIR.exists():
         return []
     extensions = []
-    for cpp_file in sorted(_CSRC_DIR.rglob("python_*.cpp")):
+    for cpp_file in sorted(CSRC_DIR.rglob("python_*.cpp")):
-        parts = list(cpp_file.with_suffix("").parts)
+        # CSRC_DIR is absolute; build the dotted module name from the
+        # repo-relative path so it does not depend on the checkout location.
+        parts = list(cpp_file.relative_to(REPO_ROOT).with_suffix("").parts)
         parts[-1] = parts[-1].removeprefix("python_")
         module_name = ".".join(parts)
@@ -40,7 +40,7 @@ def find_cpp_extensions() -> list[Extension]:
             CppExtension(
                 name=module_name,
                 sources=sources,
-                extra_compile_args=["-O3", "-std=c++17", "-Wall", "-Wextra"],
+                extra_compile_args=COMPILE_ARGS,
             )
         )
     return extensions
diff --git a/scripts/cpp_build_constants.py b/scripts/cpp_build_constants.py
new file mode 100644
index 000000000..ab1c7cc36
--- /dev/null
+++ b/scripts/cpp_build_constants.py
@@ -0,0 +1,17 @@
+"""Shared C++ build constants for build_cpp_extensions.py and generate_compile_commands.py.
+
+This is the single source of truth for C++ compiler flags and source paths.
+Both scripts import from here so clang-tidy always analyzes with the same flags
+used during the actual build.
+"""
+
+from pathlib import Path
+
+# REPO_ROOT is derived from this file's location — this file must live in scripts/.
+REPO_ROOT: Path = Path(__file__).resolve().parent.parent
+CSRC_DIR: Path = REPO_ROOT / "gigl" / "csrc"
+
+# Flags passed to every C++ compilation unit. Applies to both the extension
+# build (build_cpp_extensions.py) and the compile_commands.json used by
+# clang-tidy (generate_compile_commands.py).
+COMPILE_ARGS: list[str] = ["-O3", "-std=c++17", "-Wall", "-Wextra"]
diff --git a/scripts/generate_compile_commands.py b/scripts/generate_compile_commands.py
index eec176848..d42d52682 100644
--- a/scripts/generate_compile_commands.py
+++ b/scripts/generate_compile_commands.py
@@ -9,58 +9,46 @@
     uv run python scripts/generate_compile_commands.py
 
 Output: ``build/compile_commands.json`` (created or overwritten).
+
+Note: run ``make build_cpp_extensions`` before this script (or use ``make lint_cpp``,
+which does both in the correct order) so the database reflects the current build state.
 """
 
 import json
-import subprocess
 import sys
 import sysconfig
-from pathlib import Path
 
 from torch.utils.cpp_extension import include_paths as torch_include_paths
 
+from cpp_build_constants import COMPILE_ARGS, CSRC_DIR, REPO_ROOT
 
-def main() -> None:
-    repo_root = Path(__file__).parent.parent.resolve()
-
-    # Always rebuild C++ extensions before generating compile_commands.json so
-    # the database reflects the current state of the code.
-    subprocess.run(
-        [sys.executable, "scripts/build_cpp_extensions.py", "build_ext", "--inplace"],
-        cwd=repo_root,
-        check=True,
-    )
 
+def main() -> None:
     # Collect all include directories needed to compile the extension.
     # torch_include_paths() returns the torch headers, which already bundle
     # pybind11 under torch/include/pybind11/ — no separate pybind11 import needed.
-    include_flags: list[str] = []
-    for path in torch_include_paths():
-        include_flags.append(f"-I{path}")
+    include_flags: list[str] = [f"-I{path}" for path in torch_include_paths()]
     # Python C API headers (e.g. Python.h) required by pybind11.
     include_flags.append(f"-I{sysconfig.get_path('include')}")
 
-    cpp_dir = repo_root / "gigl" / "csrc"
-    cpp_sources = sorted(cpp_dir.rglob("*.cpp")) if cpp_dir.exists() else []
+    cpp_sources = sorted(CSRC_DIR.rglob("*.cpp")) if CSRC_DIR.exists() else []
     if not cpp_sources:
-        print("Warning: no .cpp files found under gigl/csrc/", file=sys.stderr)
+        print(f"Warning: no .cpp files found under {CSRC_DIR}", file=sys.stderr)
+
+    cxx_flags = " ".join(COMPILE_ARGS)
 
     # Each entry in compile_commands.json describes how one source file is compiled.
     # clang-tidy reads this to reproduce the exact compilation environment.
     commands: list[dict[str, str]] = [
         {
-            "directory": str(repo_root),
+            "directory": str(REPO_ROOT),
             "file": str(source),
-            "command": (
-                f"c++ -std=c++17 -Wall -Wextra "
-                f"{' '.join(include_flags)} "
-                f"-c {source}"
-            ),
+            "command": f"c++ {cxx_flags} {' '.join(include_flags)} -c {source}",
         }
         for source in cpp_sources
     ]
 
-    output = repo_root / "build" / "compile_commands.json"
+    output = REPO_ROOT / "build" / "compile_commands.json"
     output.parent.mkdir(exist_ok=True)
     output.write_text(json.dumps(commands, indent=2))
     print(

From fe2cc0b672df8a25e531a90f2c95e023df5485d0 Mon Sep 17 00:00:00 2001
From: mkolodner <mkolodner@snapchat.com>
Date: Tue, 14 Apr 2026 00:05:43 +0000
Subject: [PATCH 14/14] Update type check

---
 gigl/csrc/sampling/ppr_forward_push.cpp       | 284 +++++++++---------
 gigl/csrc/sampling/ppr_forward_push.h         |  87 +++---
 .../csrc/sampling/python_ppr_forward_push.cpp |  34 +--
 3 files changed, 210 insertions(+), 195 deletions(-)

diff --git a/gigl/csrc/sampling/ppr_forward_push.cpp b/gigl/csrc/sampling/ppr_forward_push.cpp
index 436967cd7..f97b2f40c 100644
--- a/gigl/csrc/sampling/ppr_forward_push.cpp
+++ b/gigl/csrc/sampling/ppr_forward_push.cpp
@@ -1,209 +1,218 @@
 #include "ppr_forward_push.h"
 
-PPRForwardPushState::PPRForwardPushState(torch::Tensor seed_nodes,
-                                         int32_t seed_node_type_id,
+PPRForwardPushState::PPRForwardPushState(const torch::Tensor& seedNodes,
+                                         int32_t seedNodeTypeId,
                                          double alpha,
-                                         double requeue_threshold_factor,
-                                         std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
-                                         std::vector<int32_t> edge_type_to_dst_ntype_id,
-                                         std::vector<torch::Tensor> degree_tensors)
-    : alpha_(alpha),
-      one_minus_alpha_(1.0 - alpha),
-      requeue_threshold_factor_(requeue_threshold_factor),
+                                         double requeueThresholdFactor,
+                                         std::vector<std::vector<int32_t>> nodeTypeToEdgeTypeIds,
+                                         std::vector<int32_t> edgeTypeToDstNtypeId,
+                                         std::vector<torch::Tensor> degreeTensors)
+    : _alpha(alpha),
+      _oneMinusAlpha(1.0 - alpha),
+      _requeueThresholdFactor(requeueThresholdFactor),
       // std::move transfers ownership of each vector into the member variable
       // without copying its contents — equivalent to Python's list hand-off
       // when you no longer need the original.
-      node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)),
-      edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)),
-      degree_tensors_(std::move(degree_tensors)) {
-    TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D");
-    batch_size_ = static_cast<int32_t>(seed_nodes.size(0));
-    num_node_types_ = static_cast<int32_t>(node_type_to_edge_type_ids_.size());
+      _nodeTypeToEdgeTypeIds(std::move(nodeTypeToEdgeTypeIds)),
+      _edgeTypeToDstNtypeId(std::move(edgeTypeToDstNtypeId)),
+      _degreeTensors(std::move(degreeTensors)) {
+    TORCH_CHECK(seedNodes.dim() == 1, "seedNodes must be 1D");
+    _batchSize = static_cast<int32_t>(seedNodes.size(0));
+    _numNodeTypes = static_cast<int32_t>(_nodeTypeToEdgeTypeIds.size());
 
     // Allocate per-seed, per-node-type tables.
     // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python.
-    ppr_scores_.assign(batch_size_, std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-    residuals_.assign(batch_size_, std::vector<std::unordered_map<int32_t, double>>(num_node_types_));
-    queue_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
-    queued_nodes_.assign(batch_size_, std::vector<std::unordered_set<int32_t>>(num_node_types_));
+    _pprScores.assign(_batchSize, std::vector<std::unordered_map<int32_t, double>>(_numNodeTypes));
+    _residuals.assign(_batchSize, std::vector<std::unordered_map<int32_t, double>>(_numNodeTypes));
+    _queue.assign(_batchSize, std::vector<std::unordered_set<int32_t>>(_numNodeTypes));
+    _queuedNodes.assign(_batchSize, std::vector<std::unordered_set<int32_t>>(_numNodeTypes));
 
     // accessor<dtype, ndim>() returns a typed view into the tensor's data that
     // supports [i] indexing with bounds checking in debug builds.
-    auto acc = seed_nodes.accessor<int64_t, 1>();
-    num_nodes_in_queue_ = batch_size_;
-    for (int32_t i = 0; i < batch_size_; ++i) {
-        int32_t seed = static_cast<int32_t>(acc[i]);
+    auto acc = seedNodes.accessor<int64_t, 1>();
+    _numNodesInQueue = _batchSize;
+    for (int32_t i = 0; i < _batchSize; ++i) {
+        auto seed = static_cast<int32_t>(acc[i]);
         // PPR initialisation: each seed starts with residual = alpha (the
         // restart probability).  The first push will move alpha into ppr_score
         // and distribute (1-alpha)*alpha to the seed's neighbors.
-        residuals_[i][seed_node_type_id][seed] = alpha_;
-        queue_[i][seed_node_type_id].insert(seed);
+        _residuals[i].at(seedNodeTypeId)[seed] = _alpha;  // .at(): a bad node type id throws instead of UB
+        _queue[i].at(seedNodeTypeId).insert(seed);
     }
 }
 
-std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::drain_queue() {
-    if (num_nodes_in_queue_ == 0) {
+std::optional<std::unordered_map<int32_t, torch::Tensor>> PPRForwardPushState::drainQueue() {
+    if (_numNodesInQueue == 0) {
         return std::nullopt;
     }
 
     // Reset the snapshot from the previous iteration.
-    for (int32_t s = 0; s < batch_size_; ++s)
-        for (auto& qs : queued_nodes_[s])
+    for (int32_t s = 0; s < _batchSize; ++s) {
+        for (auto& qs : _queuedNodes[s]) {
             qs.clear();
+        }
+    }
 
-    // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for
+    // nodesToLookup[eid] = set of node IDs that need a neighbor fetch for
     // edge type eid this round.  Using a set deduplicates nodes that appear
     // in multiple seeds' queues: we only fetch each (node, etype) pair once.
-    std::unordered_map<int32_t, std::unordered_set<int32_t>> nodes_to_lookup;
+    std::unordered_map<int32_t, std::unordered_set<int32_t>> nodesToLookup;
 
-    int32_t total_drained_this_round = 0;
-    for (int32_t s = 0; s < batch_size_; ++s) {
-        for (int32_t nt = 0; nt < num_node_types_; ++nt) {
-            if (queue_[s][nt].empty())
+    int32_t totalDrainedThisRound = 0;
+    for (int32_t s = 0; s < _batchSize; ++s) {
+        for (int32_t nt = 0; nt < _numNodeTypes; ++nt) {
+            if (_queue[s][nt].empty()) {
                 continue;
+            }
 
             // Move the live queue into the snapshot (no data copy — O(1)).
-            queued_nodes_[s][nt] = std::move(queue_[s][nt]);
-            queue_[s][nt].clear();
-            total_drained_this_round += static_cast<int32_t>(queued_nodes_[s][nt].size());
-            num_nodes_in_queue_ -= static_cast<int32_t>(queued_nodes_[s][nt].size());
-
-            for (int32_t node_id : queued_nodes_[s][nt]) {
-                for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
-                    if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) {
-                        nodes_to_lookup[eid].insert(node_id);
+            _queuedNodes[s][nt] = std::move(_queue[s][nt]);
+            _queue[s][nt].clear();
+            totalDrainedThisRound += static_cast<int32_t>(_queuedNodes[s][nt].size());
+            _numNodesInQueue -= static_cast<int32_t>(_queuedNodes[s][nt].size());
+
+            for (int32_t nodeId : _queuedNodes[s][nt]) {
+                for (int32_t eid : _nodeTypeToEdgeTypeIds[nt]) {
+                    if (_neighborCache.find(packKey(nodeId, eid)) == _neighborCache.end()) {
+                        nodesToLookup[eid].insert(nodeId);
                     }
                 }
             }
         }
     }
 
-    nodes_drained_per_iteration_.push_back(total_drained_this_round);
+    _nodesDrainedPerIteration.push_back(totalDrainedThisRound);
 
     std::unordered_map<int32_t, torch::Tensor> result;
-    for (auto& [eid, node_set] : nodes_to_lookup) {
-        std::vector<int64_t> ids(node_set.begin(), node_set.end());
+    for (auto& [eid, nodeSet] : nodesToLookup) {
+        std::vector<int64_t> ids(nodeSet.begin(), nodeSet.end());
         result[eid] = torch::tensor(ids, torch::kLong);
     }
     return result;
 }
 
-const std::vector<int32_t>& PPRForwardPushState::get_nodes_drained_per_iteration() const {
-    return nodes_drained_per_iteration_;
+const std::vector<int32_t>& PPRForwardPushState::getNodesDrainedPerIteration() const {
+    return _nodesDrainedPerIteration;
 }
 
-void PPRForwardPushState::push_residuals(
-    const std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>& fetched_by_etype_id) {
-    // Step 1: Unpack the input map into a C++ map keyed by pack_key(node_id, etype_id)
+void PPRForwardPushState::pushResiduals(
+    const std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>& fetchedByEtypeId) {
+    // Step 1: Unpack the input map into a C++ map keyed by packKey(nodeId, etypeId)
     // for fast lookup during the residual-push loop below.
     std::unordered_map<uint64_t, std::vector<int32_t>> fetched;
-    for (const auto& [eid, tup] : fetched_by_etype_id) {
-        const auto& node_ids_t = std::get<0>(tup);
-        const auto& flat_nbrs_t = std::get<1>(tup);
-        const auto& counts_t = std::get<2>(tup);
+    for (const auto& [eid, tup] : fetchedByEtypeId) {
+        const auto& nodeIdsT = std::get<0>(tup);
+        const auto& flatNbrsT = std::get<1>(tup);
+        const auto& countsT = std::get<2>(tup);
 
         // accessor<int64_t, 1>() gives a bounds-checked, typed 1-D view into
         // each tensor's data — equivalent to iterating over a NumPy array.
-        auto node_acc = node_ids_t.accessor<int64_t, 1>();
-        auto nbr_acc = flat_nbrs_t.accessor<int64_t, 1>();
-        auto cnt_acc = counts_t.accessor<int64_t, 1>();
+        auto nodeAcc = nodeIdsT.accessor<int64_t, 1>();
+        auto nbrAcc = flatNbrsT.accessor<int64_t, 1>();
+        auto cntAcc = countsT.accessor<int64_t, 1>();
 
         // Walk the flat neighbor list, slicing out each node's neighbors using
         // the running offset into the concatenated flat buffer.
         int64_t offset = 0;
-        for (int64_t i = 0; i < node_ids_t.size(0); ++i) {
-            int32_t nid = static_cast<int32_t>(node_acc[i]);
-            int64_t count = cnt_acc[i];
+        for (int64_t i = 0; i < nodeIdsT.size(0); ++i) {
+            auto nid = static_cast<int32_t>(nodeAcc[i]);
+            int64_t count = cntAcc[i];
             std::vector<int32_t> nbrs(count);
-            for (int64_t j = 0; j < count; ++j)
-                nbrs[j] = static_cast<int32_t>(nbr_acc[offset + j]);
-            fetched[pack_key(nid, eid)] = std::move(nbrs);
+            for (int64_t j = 0; j < count; ++j) {
+                nbrs[j] = static_cast<int32_t>(nbrAcc[offset + j]);
+            }
+            fetched[packKey(nid, eid)] = std::move(nbrs);
             offset += count;
         }
     }
 
-    // Step 2: For every node that was in the queue (captured in queued_nodes_
-    // by drain_queue()), apply one PPR push step:
+    // Step 2: For every node that was in the queue (captured in _queuedNodes
+    // by drainQueue()), apply one PPR push step:
     //   a. Absorb residual into the PPR score.
     //   b. Distribute (1-alpha) * residual equally to each neighbor.
     //   c. Enqueue any neighbor whose residual now exceeds the requeue threshold.
-    for (int32_t s = 0; s < batch_size_; ++s) {
-        for (int32_t nt = 0; nt < num_node_types_; ++nt) {
-            if (queued_nodes_[s][nt].empty())
+    for (int32_t s = 0; s < _batchSize; ++s) {
+        for (int32_t nt = 0; nt < _numNodeTypes; ++nt) {
+            if (_queuedNodes[s][nt].empty()) {
                 continue;
+            }
 
-            for (int32_t src : queued_nodes_[s][nt]) {
-                auto& src_res = residuals_[s][nt];
-                auto it = src_res.find(src);
-                double res = (it != src_res.end()) ? it->second : 0.0;
+            for (int32_t src : _queuedNodes[s][nt]) {
+                auto& srcRes = _residuals[s][nt];
+                auto it = srcRes.find(src);
+                double res = (it != srcRes.end()) ? it->second : 0.0;
 
                 // a. Absorb: move residual into the PPR score.
-                ppr_scores_[s][nt][src] += res;
-                src_res[src] = 0.0;
+                _pprScores[s][nt][src] += res;
+                srcRes[src] = 0.0;
 
                 // b. Count total fetched/cached neighbors across all edge types for
                 // this source node.  We normalise by the number of neighbors we
                 // actually retrieved, not the true degree, so residual is fully
                 // distributed among known neighbors rather than leaking to unfetched
                 // ones (which matters when num_neighbors_per_hop < true_degree).
-                int32_t total_fetched = 0;
-                for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
-                    auto fi = fetched.find(pack_key(src, eid));
+                int32_t totalFetched = 0;
+                for (int32_t eid : _nodeTypeToEdgeTypeIds[nt]) {
+                    auto fi = fetched.find(packKey(src, eid));
                     if (fi != fetched.end()) {
-                        total_fetched += static_cast<int32_t>(fi->second.size());
+                        totalFetched += static_cast<int32_t>(fi->second.size());
                     } else {
-                        auto ci = neighbor_cache_.find(pack_key(src, eid));
-                        if (ci != neighbor_cache_.end())
-                            total_fetched += static_cast<int32_t>(ci->second.size());
+                        auto ci = _neighborCache.find(packKey(src, eid));
+                        if (ci != _neighborCache.end()) {
+                            totalFetched += static_cast<int32_t>(ci->second.size());
+                        }
                     }
                 }
                 // Destination-only nodes (or nodes with no fetched neighbors) absorb
                 // residual but do not push further.
-                if (total_fetched == 0)
+                if (totalFetched == 0) {
                     continue;
+                }
 
-                double res_per_nbr = one_minus_alpha_ * res / static_cast<double>(total_fetched);
+                double resPerNbr = _oneMinusAlpha * res / static_cast<double>(totalFetched);
 
-                for (int32_t eid : node_type_to_edge_type_ids_[nt]) {
-                    // Invariant: fetched and neighbor_cache_ are mutually exclusive for
-                    // any given (node, etype) key within one iteration.  drain_queue()
-                    // only requests a fetch for nodes absent from neighbor_cache_, so a
+                for (int32_t eid : _nodeTypeToEdgeTypeIds[nt]) {
+                    // Invariant: fetched and _neighborCache are mutually exclusive for
+                    // any given (node, etype) key within one iteration.  drainQueue()
+                    // only requests a fetch for nodes absent from _neighborCache, so a
                     // key is in at most one of the two.
-                    const std::vector<int32_t>* nbr_list = nullptr;
-                    auto fi = fetched.find(pack_key(src, eid));
+                    const std::vector<int32_t>* nbrList = nullptr;
+                    auto fi = fetched.find(packKey(src, eid));
                     if (fi != fetched.end()) {
-                        nbr_list = &fi->second;
+                        nbrList = &fi->second;
                     } else {
-                        auto ci = neighbor_cache_.find(pack_key(src, eid));
-                        if (ci != neighbor_cache_.end())
-                            nbr_list = &ci->second;
+                        auto ci = _neighborCache.find(packKey(src, eid));
+                        if (ci != _neighborCache.end()) {
+                            nbrList = &ci->second;
+                        }
                     }
-                    if (!nbr_list || nbr_list->empty())
+                    if (!nbrList || nbrList->empty()) {
                         continue;
+                    }
 
-                    int32_t dst_nt = edge_type_to_dst_ntype_id_[eid];
+                    int32_t dstNt = _edgeTypeToDstNtypeId[eid];
 
                     // c. Accumulate residual for each neighbor and re-enqueue if threshold
                     // exceeded.
-                    for (int32_t nbr : *nbr_list) {
-                        residuals_[s][dst_nt][nbr] += res_per_nbr;
+                    for (int32_t nbr : *nbrList) {
+                        _residuals[s][dstNt][nbr] += resPerNbr;
 
-                        double threshold =
-                            requeue_threshold_factor_ * static_cast<double>(get_total_degree(nbr, dst_nt));
+                        double threshold = _requeueThresholdFactor * static_cast<double>(getTotalDegree(nbr, dstNt));
 
-                        if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() &&
-                            residuals_[s][dst_nt][nbr] >= threshold) {
-                            queue_[s][dst_nt].insert(nbr);
-                            ++num_nodes_in_queue_;
+                        if (_queue[s][dstNt].find(nbr) == _queue[s][dstNt].end() &&
+                            _residuals[s][dstNt][nbr] >= threshold) {
+                            _queue[s][dstNt].insert(nbr);
+                            ++_numNodesInQueue;
 
                             // Promote neighbor lists to the persistent cache: this node will
                             // be processed next iteration, so caching avoids a re-fetch.
-                            for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) {
-                                uint64_t pk = pack_key(nbr, peid);
-                                if (neighbor_cache_.find(pk) == neighbor_cache_.end()) {
+                            for (int32_t peid : _nodeTypeToEdgeTypeIds[dstNt]) {
+                                uint64_t pk = packKey(nbr, peid);
+                                if (_neighborCache.find(pk) == _neighborCache.end()) {
                                     auto pfi = fetched.find(pk);
-                                    if (pfi != fetched.end())
-                                        neighbor_cache_[pk] = pfi->second;
+                                    if (pfi != fetched.end()) {
+                                        _neighborCache[pk] = pfi->second;
+                                    }
                                 }
                             }
                         }
@@ -214,23 +223,26 @@ void PPRForwardPushState::push_residuals(
     }
 }
 
-std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> PPRForwardPushState::extract_top_k(
-    int32_t max_ppr_nodes) {
+std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> PPRForwardPushState::extractTopK(
+    int32_t maxPprNodes) {
     std::unordered_set<int32_t> active;
-    for (int32_t s = 0; s < batch_size_; ++s)
-        for (int32_t nt = 0; nt < num_node_types_; ++nt)
-            if (!ppr_scores_[s][nt].empty())
+    for (int32_t s = 0; s < _batchSize; ++s) {
+        for (int32_t nt = 0; nt < _numNodeTypes; ++nt) {
+            if (!_pprScores[s][nt].empty()) {
                 active.insert(nt);
+            }
+        }
+    }
 
     std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> result;
     for (int32_t nt : active) {
-        std::vector<int64_t> flat_ids;
-        std::vector<float> flat_weights;
-        std::vector<int64_t> valid_counts;
+        std::vector<int64_t> flatIds;
+        std::vector<float> flatWeights;
+        std::vector<int64_t> validCounts;
 
-        for (int32_t s = 0; s < batch_size_; ++s) {
-            const auto& scores = ppr_scores_[s][nt];
-            int32_t k = std::min(max_ppr_nodes, static_cast<int32_t>(scores.size()));
+        for (int32_t s = 0; s < _batchSize; ++s) {
+            const auto& scores = _pprScores[s][nt];
+            int32_t k = std::min(maxPprNodes, static_cast<int32_t>(scores.size()));
             if (k > 0) {
                 std::vector<std::pair<int32_t, double>> items(scores.begin(), scores.end());
                 std::partial_sort(items.begin(), items.begin() + k, items.end(), [](const auto& a, const auto& b) {
@@ -238,36 +250,38 @@ std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tens
                 });
 
                 for (int32_t i = 0; i < k; ++i) {
-                    flat_ids.push_back(static_cast<int64_t>(items[i].first));
+                    flatIds.push_back(static_cast<int64_t>(items[i].first));
                     // Cast to float32 for output; internal scores stay double to
                     // avoid accumulated rounding errors in the push loop.
-                    flat_weights.push_back(static_cast<float>(items[i].second));
+                    flatWeights.push_back(static_cast<float>(items[i].second));
                 }
             }
-            valid_counts.push_back(static_cast<int64_t>(k));
+            validCounts.push_back(static_cast<int64_t>(k));
         }
 
-        result[nt] = {torch::tensor(flat_ids, torch::kLong),
-                      torch::tensor(flat_weights, torch::kFloat),
-                      torch::tensor(valid_counts, torch::kLong)};
+        result[nt] = {torch::tensor(flatIds, torch::kLong),
+                      torch::tensor(flatWeights, torch::kFloat),
+                      torch::tensor(validCounts, torch::kLong)};
     }
     return result;
 }
 
-int32_t PPRForwardPushState::get_total_degree(int32_t node_id, int32_t ntype_id) const {
-    if (ntype_id >= static_cast<int32_t>(degree_tensors_.size()))
+int32_t PPRForwardPushState::getTotalDegree(int32_t nodeId, int32_t ntypeId) const {
+    if (ntypeId >= static_cast<int32_t>(_degreeTensors.size())) {
         return 0;
-    const auto& t = degree_tensors_[ntype_id];
-    if (t.numel() == 0)
+    }
+    const auto& t = _degreeTensors[ntypeId];
+    if (t.numel() == 0) {
         return 0;
-    TORCH_CHECK(node_id < static_cast<int32_t>(t.size(0)),
+    }
+    TORCH_CHECK(nodeId < static_cast<int32_t>(t.size(0)),
                 "Node ID ",
-                node_id,
+                nodeId,
                 " out of range for degree tensor of ntype_id ",
-                ntype_id,
+                ntypeId,
                 " (size=",
                 t.size(0),
                 "). This indicates corrupted graph data or a sampler bug.");
     // data_ptr<int32_t>() returns a raw C pointer to the tensor's int32 data buffer.
-    return t.data_ptr<int32_t>()[node_id];
+    return t.data_ptr<int32_t>()[nodeId];
 }
diff --git a/gigl/csrc/sampling/ppr_forward_push.h b/gigl/csrc/sampling/ppr_forward_push.h
index 82973ff7a..7a6fac69d 100644
--- a/gigl/csrc/sampling/ppr_forward_push.h
+++ b/gigl/csrc/sampling/ppr_forward_push.h
@@ -22,9 +22,9 @@
 // negative int32_t (e.g. -1 = 0xFFFFFFFF) would be sign-extended to a full
 // 64-bit value, corrupting the upper bits when shifted.  Reinterpreting as
 // uint32_t first treats the bit pattern as-is (no sign extension).
-static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
-    return (static_cast<uint64_t>(static_cast<uint32_t>(node_id)) << 32) |
-           static_cast<uint32_t>(etype_id);
+static inline uint64_t packKey(int32_t nodeId, int32_t etypeId) {
+    return (static_cast<uint64_t>(static_cast<uint32_t>(nodeId)) << 32) |
+           static_cast<uint32_t>(etypeId);
 }
 
 // C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006).
@@ -33,43 +33,45 @@ static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) {
 // this object.  The distributed neighbor fetch is kept in Python because it
 // involves async RPC calls that C++ cannot drive directly.
 //
-// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache.
+// Owned state: _pprScores, _residuals, _queue, _queuedNodes, _neighborCache.
 // Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors).
 //
 // Typical call sequence per batch:
-//   1.  PPRForwardPushState(seed_nodes, ...)   — init per-seed residuals / queue
+//   1.  PPRForwardPushState(seedNodes, ...)   — init per-seed residuals / queue
 //   while True:
-//   2.  drain_queue()                          — drain queue → nodes needing lookup
+//   2.  drainQueue()                          — drain queue → nodes needing lookup
 //   3.  <Python: _batch_fetch_neighbors(...)>  — distributed RPC fetch (stays in Python)
-//   4.  push_residuals(fetched_by_etype_id)    — push residuals, update queue
-//   5.  extract_top_k(max_ppr_nodes)           — top-k selection per seed per node type
+//   4.  pushResiduals(fetchedByEtypeId)        — push residuals, update queue
+//   5.  extractTopK(maxPprNodes)               — after the loop stops: top-k selection per seed per node type
 class PPRForwardPushState {
    public:
-    PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha,
-                        double requeue_threshold_factor,
-                        std::vector<std::vector<int32_t>> node_type_to_edge_type_ids,
-                        std::vector<int32_t> edge_type_to_dst_ntype_id,
-                        std::vector<torch::Tensor> degree_tensors);
+    PPRForwardPushState(const torch::Tensor& seedNodes,
+                        int32_t seedNodeTypeId,
+                        double alpha,
+                        double requeueThresholdFactor,
+                        std::vector<std::vector<int32_t>> nodeTypeToEdgeTypeIds,
+                        std::vector<int32_t> edgeTypeToDstNtypeId,
+                        std::vector<torch::Tensor> degreeTensors);
 
     // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch
-    // neighbor lookup.  Also snapshots the drained nodes into queued_nodes_ for
-    // use by push_residuals().
+    // neighbor lookup.  Also snapshots the drained nodes into _queuedNodes for
+    // use by pushResiduals().
     //
     // Return value semantics:
     //   - std::nullopt   → queue was already empty; convergence achieved; stop the loop.
-    //   - empty map      → nodes were drained but all were cached; call push_residuals({}).
+    //   - empty map      → nodes were drained but all were cached; call pushResiduals({}).
     //   - non-empty map  → {etype_id → 1-D int64 tensor of node IDs} needing neighbor lookup.
-    std::optional<std::unordered_map<int32_t, torch::Tensor>> drain_queue();
+    std::optional<std::unordered_map<int32_t, torch::Tensor>> drainQueue();
 
     // Push residuals to neighbors given the fetched neighbor data.
     //
-    // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)}
+    // fetchedByEtypeId: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)}
     //   - node_ids_tensor:  [N]           int64 — source node IDs fetched for this edge type
     //   - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat
     //   - counts_tensor:    [N]           int64 — neighbor count for each source node
-    void push_residuals(const std::unordered_map<
-                        int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>&
-                            fetched_by_etype_id);
+    void pushResiduals(const std::unordered_map<
+                       int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>&
+                           fetchedByEtypeId);
 
     // Extract top-k PPR nodes per seed per node type.
     //
@@ -81,52 +83,51 @@ class PPRForwardPushState {
     //   flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1
     //   ...
     std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>>
-    extract_top_k(int32_t max_ppr_nodes);
+    extractTopK(int32_t maxPprNodes);
+
+    // Returns the drained-node counts recorded in _nodesDrainedPerIteration by the drainQueue() calls so far.
+    [[nodiscard]] const std::vector<int32_t>& getNodesDrainedPerIteration() const;
 
    private:
     // Look up the total (across all edge types) out-degree of a node.
     // Returns 0 for destination-only node types (no outgoing edges).
-    int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const;
+    [[nodiscard]] int32_t getTotalDegree(int32_t nodeId, int32_t ntypeId) const;
 
     // -------------------------------------------------------------------------
     // Scalar algorithm parameters
     // -------------------------------------------------------------------------
-    double alpha_;            // Restart probability
-    double one_minus_alpha_;  // 1 - alpha, precomputed to avoid repeated subtraction
-    double requeue_threshold_factor_;  // alpha * eps; multiplied by degree to get per-node threshold
+    double _alpha;                    // Restart probability
+    double _oneMinusAlpha;            // 1 - alpha, precomputed to avoid repeated subtraction
+    double _requeueThresholdFactor;   // alpha * eps; multiplied by degree to get per-node threshold
 
-    int32_t batch_size_;             // Number of seeds in the current batch
-    int32_t num_node_types_;         // Total number of node types (homo + hetero)
-    int32_t num_nodes_in_queue_{0};  // Running count of nodes across all seeds / types
+    int32_t _batchSize;                    // Number of seeds in the current batch
+    int32_t _numNodeTypes;                 // Total number of node types (homo + hetero)
+    int32_t _numNodesInQueue{0};           // Running count of nodes across all seeds / types
 
     // -------------------------------------------------------------------------
     // Graph structure (read-only after construction)
     // -------------------------------------------------------------------------
-    std::vector<std::vector<int32_t>> node_type_to_edge_type_ids_;
-    std::vector<int32_t> edge_type_to_dst_ntype_id_;
-    std::vector<torch::Tensor> degree_tensors_;
+    std::vector<std::vector<int32_t>> _nodeTypeToEdgeTypeIds;
+    std::vector<int32_t> _edgeTypeToDstNtypeId;
+    std::vector<torch::Tensor> _degreeTensors;
 
     // -------------------------------------------------------------------------
     // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id])
     // -------------------------------------------------------------------------
-    std::vector<std::vector<std::unordered_map<int32_t, double>>> ppr_scores_;
-    std::vector<std::vector<std::unordered_map<int32_t, double>>> residuals_;
-    std::vector<std::vector<std::unordered_set<int32_t>>> queue_;
-    std::vector<std::vector<std::unordered_set<int32_t>>> queued_nodes_;
+    std::vector<std::vector<std::unordered_map<int32_t, double>>> _pprScores;
+    std::vector<std::vector<std::unordered_map<int32_t, double>>> _residuals;
+    std::vector<std::vector<std::unordered_set<int32_t>>> _queue;
+    std::vector<std::vector<std::unordered_set<int32_t>>> _queuedNodes;
 
     // -------------------------------------------------------------------------
     // Neighbor cache
     // -------------------------------------------------------------------------
-    std::unordered_map<uint64_t, std::vector<int32_t>> neighbor_cache_;
+    std::unordered_map<uint64_t, std::vector<int32_t>> _neighborCache;
 
     // -------------------------------------------------------------------------
     // Diagnostics (populated during the algorithm; read after convergence)
     // -------------------------------------------------------------------------
-    // Total nodes drained (across all seeds and node types) in each drain_queue()
+    // Total nodes drained (across all seeds and node types) in each drainQueue()
     // call.  One entry per loop iteration; useful for understanding convergence speed.
-    std::vector<int32_t> nodes_drained_per_iteration_;
-
-   public:
-    // Returns nodes_drained_per_iteration_ built up across all drain_queue() calls.
-    const std::vector<int32_t>& get_nodes_drained_per_iteration() const;
+    std::vector<int32_t> _nodesDrainedPerIteration;
 };
diff --git a/gigl/csrc/sampling/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp
index aafb32cdc..98ff40179 100644
--- a/gigl/csrc/sampling/python_ppr_forward_push.cpp
+++ b/gigl/csrc/sampling/python_ppr_forward_push.cpp
@@ -11,10 +11,10 @@
 
 namespace py = pybind11;
 
-// drain_queue: C++ returns std::optional<map<etype_id, Tensor>>.
+// drainQueue: C++ returns std::optional<map<etype_id, Tensor>>.
 // Exposed to Python as: None (convergence) or dict[int, Tensor].
-static py::object drain_queue_wrapper(PPRForwardPushState& self) {
-    auto result = self.drain_queue();
+static py::object drainQueueWrapper(PPRForwardPushState& self) {
+    auto result = self.drainQueue();
     if (!result) {
         return py::none();
     }
@@ -25,29 +25,29 @@ static py::object drain_queue_wrapper(PPRForwardPushState& self) {
     return d;
 }
 
-// push_residuals: Python passes dict[int, tuple[Tensor, Tensor, Tensor]].
+// pushResiduals: Python passes dict[int, tuple[Tensor, Tensor, Tensor]].
 // Convert to C++ map before delegating.
-static void push_residuals_wrapper(PPRForwardPushState& self, py::dict fetched_by_etype_id) {
-    std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> cpp_map;
+static void pushResidualsWrapper(PPRForwardPushState& self, const py::dict& fetchedByEtypeId) {
+    std::unordered_map<int32_t, std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>> cppMap;
     // Dict iteration touches Python objects — GIL must be held here.
-    for (auto item : fetched_by_etype_id) {
-        int32_t eid = item.first.cast<int32_t>();
+    for (auto item : fetchedByEtypeId) {
+        auto eid = item.first.cast<int32_t>();
         auto tup = item.second.cast<py::tuple>();
-        cpp_map[eid] = {tup[0].cast<torch::Tensor>(), tup[1].cast<torch::Tensor>(), tup[2].cast<torch::Tensor>()};
+        cppMap[eid] = {tup[0].cast<torch::Tensor>(), tup[1].cast<torch::Tensor>(), tup[2].cast<torch::Tensor>()};
     }
     // C++ push only uses tensor accessor/data_ptr APIs — GIL-safe to release.
     // Releasing here lets the asyncio event loop process RPC completion callbacks
     // from other concurrent PPR coroutines while this push runs.
     {
         py::gil_scoped_release release;
-        self.push_residuals(cpp_map);
+        self.pushResiduals(cppMap);
     }
 }
 
-// extract_top_k: C++ returns map<ntype_id, tuple<Tensor, Tensor, Tensor>>.
+// extractTopK: C++ returns map<ntype_id, tuple<Tensor, Tensor, Tensor>>.
 // Exposed to Python as dict[int, tuple[Tensor, Tensor, Tensor]].
-static py::dict extract_top_k_wrapper(PPRForwardPushState& self, int32_t max_ppr_nodes) {
-    auto result = self.extract_top_k(max_ppr_nodes);
+static py::dict extractTopKWrapper(PPRForwardPushState& self, int32_t maxPprNodes) {
+    auto result = self.extractTopK(maxPprNodes);
     py::dict d;
     for (auto& [nt, tup] : result) {
         d[py::int_(nt)] = py::make_tuple(std::get<0>(tup), std::get<1>(tup), std::get<2>(tup));
@@ -66,8 +66,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
                       std::vector<std::vector<int32_t>>,
                       std::vector<int32_t>,
                       std::vector<torch::Tensor>>())
-        .def("drain_queue", drain_queue_wrapper)
-        .def("push_residuals", push_residuals_wrapper)
-        .def("extract_top_k", extract_top_k_wrapper)
-        .def("get_nodes_drained_per_iteration", &PPRForwardPushState::get_nodes_drained_per_iteration);
+        .def("drain_queue", drainQueueWrapper)
+        .def("push_residuals", pushResidualsWrapper)
+        .def("extract_top_k", extractTopKWrapper)
+        .def("get_nodes_drained_per_iteration", &PPRForwardPushState::getNodesDrainedPerIteration);
 }