From 87dc95b08b289bcc814daa8aebe5eebe8012a5bd Mon Sep 17 00:00:00 2001 From: mkolodner Date: Tue, 24 Mar 2026 20:24:14 +0000 Subject: [PATCH 01/14] Initial commit --- gigl/distributed/cpp_extensions/__init__.py | 9 + .../cpp_extensions/ppr_forward_push.cpp | 294 +++++++++++++ .../cpp_extensions/ppr_forward_push.pyi | 21 + gigl/distributed/dist_ppr_sampler.py | 388 +++++++----------- 4 files changed, 463 insertions(+), 249 deletions(-) create mode 100644 gigl/distributed/cpp_extensions/__init__.py create mode 100644 gigl/distributed/cpp_extensions/ppr_forward_push.cpp create mode 100644 gigl/distributed/cpp_extensions/ppr_forward_push.pyi diff --git a/gigl/distributed/cpp_extensions/__init__.py b/gigl/distributed/cpp_extensions/__init__.py new file mode 100644 index 000000000..d375f59b1 --- /dev/null +++ b/gigl/distributed/cpp_extensions/__init__.py @@ -0,0 +1,9 @@ +try: + from gigl.distributed.cpp_extensions.ppr_forward_push import PPRForwardPushState +except ImportError as e: + raise ImportError( + "PPR C++ extension not compiled. " + "Run `uv pip install -e .` from the GiGL root to build it." + ) from e + +__all__ = ["PPRForwardPushState"] diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp new file mode 100644 index 000000000..6f6d10545 --- /dev/null +++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp @@ -0,0 +1,294 @@ +#include +#include + +#include +#include +#include +#include +#include + +namespace py = pybind11; + +// Pack (node_id, etype_id) into a single uint64_t lookup key. +// Requires both values fit in 32 bits — enforced by the Python caller. +static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { + return (static_cast(static_cast(node_id)) << 32) | + static_cast(etype_id); +} + +// C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006). +// +// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache. 
+// Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors). +// +// Typical call sequence per batch: +// 1. PPRForwardPushState(seed_nodes, ...) — init per-seed residuals / queue +// while True: +// 2. drain_queue() — drain queue → nodes needing lookup +// 3. — distributed RPC fetch (stays in Python) +// 4. push_residuals(fetched_by_etype_id) — push residuals, update queue +// 5. extract_top_k(max_ppr_nodes) — top-k selection per seed per node type +class PPRForwardPushState { +public: + PPRForwardPushState( + torch::Tensor seed_nodes, + int32_t seed_node_type_id, + float alpha, + float requeue_threshold_factor, + std::vector> node_type_to_edge_type_ids, + std::vector edge_type_to_dst_ntype_id, + std::vector degree_tensors + ) + : alpha_(alpha), + one_minus_alpha_(1.0f - alpha), + requeue_threshold_factor_(requeue_threshold_factor), + node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)), + edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)), + degree_tensors_(std::move(degree_tensors)) { + + TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D"); + batch_size_ = static_cast(seed_nodes.size(0)); + num_node_types_ = static_cast(node_type_to_edge_type_ids_.size()); + + ppr_scores_.assign(batch_size_, std::vector>(num_node_types_)); + residuals_.assign(batch_size_, std::vector>(num_node_types_)); + queue_.assign(batch_size_, std::vector>(num_node_types_)); + queued_nodes_.assign(batch_size_, std::vector>(num_node_types_)); + + auto acc = seed_nodes.accessor(); + num_nodes_in_queue_ = batch_size_; + for (int32_t i = 0; i < batch_size_; ++i) { + int32_t seed = static_cast(acc[i]); + residuals_[i][seed_node_type_id][seed] = alpha_; + queue_[i][seed_node_type_id].insert(seed); + } + } + + // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch + // neighbor lookup. Also snapshots the drained nodes into queued_nodes_ for + // use by push_residuals(). 
+ // + // Returns None when the queue is truly empty (convergence signal). + // Returns a dict (possibly empty) when nodes were drained but all had cached + // neighbors or no outgoing edges — push_residuals must still be called to + // flush their residuals into ppr_scores_. + py::object drain_queue() { + if (num_nodes_in_queue_ == 0) { + return py::none(); + } + + for (int32_t s = 0; s < batch_size_; ++s) + for (auto& qs : queued_nodes_[s]) qs.clear(); + + std::unordered_map> nodes_to_lookup; + + for (int32_t s = 0; s < batch_size_; ++s) { + for (int32_t nt = 0; nt < num_node_types_; ++nt) { + if (queue_[s][nt].empty()) continue; + + // Snapshot queue into queued_nodes, then reset queue. + queued_nodes_[s][nt] = std::move(queue_[s][nt]); + queue_[s][nt].clear(); + num_nodes_in_queue_ -= static_cast(queued_nodes_[s][nt].size()); + + for (int32_t node_id : queued_nodes_[s][nt]) { + for (int32_t eid : node_type_to_edge_type_ids_[nt]) { + // Only add to lookup if not already in the persistent cache. + if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) { + nodes_to_lookup[eid].insert(node_id); + } + } + } + } + } + + py::dict result; + for (auto& [eid, node_set] : nodes_to_lookup) { + std::vector ids(node_set.begin(), node_set.end()); + result[py::int_(eid)] = torch::tensor(ids, torch::kLong); + } + return result; + } + + // Push residuals to neighbors given the fetched neighbor data. + // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)} + // - node_ids_tensor: [N] int64 — source node IDs fetched for this edge type + // - flat_nbrs_tensor: [sum(counts)] int64 — flat concatenation of all neighbor lists + // - counts_tensor: [N] int64 — number of neighbors for each source node + void push_residuals(py::dict fetched_by_etype_id) { + // Build local fetched map: pack_key(node_id, etype_id) -> neighbor list. 
+ std::unordered_map> fetched; + for (auto item : fetched_by_etype_id) { + int32_t eid = item.first.cast(); + auto tup = item.second.cast(); + auto node_ids_t = tup[0].cast(); + auto flat_nbrs_t = tup[1].cast(); + auto counts_t = tup[2].cast(); + + auto node_acc = node_ids_t.accessor(); + auto nbr_acc = flat_nbrs_t.accessor(); + auto cnt_acc = counts_t.accessor(); + + int64_t offset = 0; + for (int64_t i = 0; i < node_ids_t.size(0); ++i) { + int32_t nid = static_cast(node_acc[i]); + int64_t count = cnt_acc[i]; + std::vector nbrs(count); + for (int64_t j = 0; j < count; ++j) + nbrs[j] = static_cast(nbr_acc[offset + j]); + fetched[pack_key(nid, eid)] = std::move(nbrs); + offset += count; + } + } + + for (int32_t s = 0; s < batch_size_; ++s) { + for (int32_t nt = 0; nt < num_node_types_; ++nt) { + if (queued_nodes_[s][nt].empty()) continue; + + for (int32_t src : queued_nodes_[s][nt]) { + auto& src_res = residuals_[s][nt]; + auto it = src_res.find(src); + float res = (it != src_res.end()) ? it->second : 0.0f; + + ppr_scores_[s][nt][src] += res; + src_res[src] = 0.0f; + + int32_t total_deg = get_total_degree(src, nt); + if (total_deg == 0) continue; + + float res_per_nbr = one_minus_alpha_ * res / static_cast(total_deg); + + for (int32_t eid : node_type_to_edge_type_ids_[nt]) { + // fetched and neighbor_cache are mutually exclusive per iteration: + // drain_queue only adds a node to nodes_to_lookup when absent from + // neighbor_cache, so a given key appears in at most one of the two. 
+ const std::vector* nbr_list = nullptr; + auto fi = fetched.find(pack_key(src, eid)); + if (fi != fetched.end()) { + nbr_list = &fi->second; + } else { + auto ci = neighbor_cache_.find(pack_key(src, eid)); + if (ci != neighbor_cache_.end()) nbr_list = &ci->second; + } + if (!nbr_list || nbr_list->empty()) continue; + + int32_t dst_nt = edge_type_to_dst_ntype_id_[eid]; + + for (int32_t nbr : *nbr_list) { + residuals_[s][dst_nt][nbr] += res_per_nbr; + + float threshold = requeue_threshold_factor_ * + static_cast(get_total_degree(nbr, dst_nt)); + + if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() && + residuals_[s][dst_nt][nbr] >= threshold) { + queue_[s][dst_nt].insert(nbr); + ++num_nodes_in_queue_; + + // Promote this node's neighbor lists to the persistent cache: + // it will be processed next iteration, so caching now avoids + // a re-fetch. Nodes that are never requeued (typically + // high-degree) are never promoted, keeping their large neighbor + // lists out of the cache. + for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) { + uint64_t pk = pack_key(nbr, peid); + if (neighbor_cache_.find(pk) == neighbor_cache_.end()) { + auto pfi = fetched.find(pk); + if (pfi != fetched.end()) + neighbor_cache_[pk] = pfi->second; + } + } + } + } + } + } + } + } + } + + // Extract top-k PPR nodes per seed per node type. + // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}. + // Only node types that received any PPR score are included in the output. 
+ py::dict extract_top_k(int32_t max_ppr_nodes) { + std::unordered_set active; + for (int32_t s = 0; s < batch_size_; ++s) + for (int32_t nt = 0; nt < num_node_types_; ++nt) + if (!ppr_scores_[s][nt].empty()) active.insert(nt); + + py::dict result; + for (int32_t nt : active) { + std::vector flat_ids; + std::vector flat_weights; + std::vector valid_counts; + + for (int32_t s = 0; s < batch_size_; ++s) { + const auto& scores = ppr_scores_[s][nt]; + int32_t k = std::min(max_ppr_nodes, static_cast(scores.size())); + if (k > 0) { + std::vector> items(scores.begin(), scores.end()); + std::partial_sort(items.begin(), items.begin() + k, items.end(), + [](const auto& a, const auto& b) { return a.second > b.second; }); + for (int32_t i = 0; i < k; ++i) { + flat_ids.push_back(static_cast(items[i].first)); + flat_weights.push_back(items[i].second); + } + } + valid_counts.push_back(static_cast(k)); + } + + result[py::int_(nt)] = py::make_tuple( + torch::tensor(flat_ids, torch::kLong), + torch::tensor(flat_weights, torch::kFloat), + torch::tensor(valid_counts, torch::kLong) + ); + } + return result; + } + +private: + int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const { + if (ntype_id >= static_cast(degree_tensors_.size())) return 0; + const auto& t = degree_tensors_[ntype_id]; + if (t.numel() == 0) return 0; // destination-only type: no outgoing edges + TORCH_CHECK( + node_id < static_cast(t.size(0)), + "Node ID ", node_id, " out of range for degree tensor of ntype_id ", ntype_id, + " (size=", t.size(0), "). This indicates corrupted graph data or a sampler bug." + ); + return t.data_ptr()[node_id]; + } + + float alpha_, one_minus_alpha_, requeue_threshold_factor_; + int32_t batch_size_, num_node_types_, num_nodes_in_queue_{0}; + + std::vector> node_type_to_edge_type_ids_; + std::vector edge_type_to_dst_ntype_id_; + std::vector degree_tensors_; + + // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]). 
+ std::vector>> ppr_scores_; + std::vector>> residuals_; + std::vector>> queue_; + // Snapshot of queue contents from the last drain_queue() call, used by push_residuals(). + std::vector>> queued_nodes_; + + // Persistent neighbor cache: pack_key(node_id, etype_id) -> neighbor list. + // Only nodes that have been requeued (and thus will be processed again) are + // promoted here from the per-iteration fetched map. + std::unordered_map> neighbor_cache_; +}; + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + py::class_(m, "PPRForwardPushState") + .def(py::init< + torch::Tensor, + int32_t, + float, float, + std::vector>, + std::vector, + std::vector + >()) + .def("drain_queue", &PPRForwardPushState::drain_queue) + .def("push_residuals", &PPRForwardPushState::push_residuals) + .def("extract_top_k", &PPRForwardPushState::extract_top_k); +} diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.pyi b/gigl/distributed/cpp_extensions/ppr_forward_push.pyi new file mode 100644 index 000000000..265468c3c --- /dev/null +++ b/gigl/distributed/cpp_extensions/ppr_forward_push.pyi @@ -0,0 +1,21 @@ +import torch + +class PPRForwardPushState: + def __init__( + self, + seed_nodes: torch.Tensor, + seed_node_type_id: int, + alpha: float, + requeue_threshold_factor: float, + node_type_to_edge_type_ids: list[list[int]], + edge_type_to_dst_ntype_id: list[int], + degree_tensors: list[torch.Tensor], + ) -> None: ... + def drain_queue(self) -> dict[int, torch.Tensor] | None: ... + def push_residuals( + self, + fetched_by_etype_id: dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]], + ) -> None: ... + def extract_top_k( + self, max_ppr_nodes: int + ) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: ... 
diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py index 17673a72d..cf4c732c0 100644 --- a/gigl/distributed/dist_ppr_sampler.py +++ b/gigl/distributed/dist_ppr_sampler.py @@ -1,14 +1,6 @@ -# TODO (mkolodner-sc): The forward push loop in _compute_ppr_scores is the -# main throughput bottleneck — both the queue drain (preparing batched node -# lookups by edge type) and the residual push/requeue pass are pure Python -# dict/set operations in tight nested loops. Moving these to a C++ extension -# (e.g. pybind11) would eliminate per-operation Python overhead and enable -# cache-friendly memory access patterns. - # TODO (mkolodner-sc): Investigate whether concurrency for _sample_one_hop and _compute_ppr_scores will # yield performance benefits. -import heapq from collections import defaultdict from typing import Optional, Union @@ -22,6 +14,7 @@ from graphlearn_torch.typing import EdgeType, NodeType from graphlearn_torch.utils import merge_dict +from gigl.distributed.cpp_extensions import PPRForwardPushState from gigl.distributed.dist_neighbor_sampler import DistNeighborSampler from gigl.types.graph import is_label_edge_type @@ -48,6 +41,49 @@ ) +def _group_fetched_by_etype_id( + fetched: dict[tuple[int, EdgeType], list[int]], + etype_to_etype_id: dict[EdgeType, int], +) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + """Group batch-fetched neighbors by integer edge type ID for the C++ push kernel. + + Performs one linear pass over the fetched dict, building flat tensors per + edge type. This avoids per-neighbor Python overhead in push_residuals by + batching all lookups for the same edge type together. + + Args: + fetched: Output of _batch_fetch_neighbors: ``(node_id, etype)`` → + neighbor list. + etype_to_etype_id: Mapping from EdgeType to its integer ID. + + Returns: + Dict mapping etype_id to ``(node_ids, flat_neighbors, counts)`` as + flat int64 tensors. 
``flat_neighbors`` is the concatenation of all + neighbor lists for that edge type; ``counts[i]`` gives the neighbor + count for ``node_ids[i]``. + """ + node_ids_by_etype: dict[int, list[int]] = {} + flat_nbrs_by_etype: dict[int, list[int]] = {} + counts_by_etype: dict[int, list[int]] = {} + for (node_id, etype), neighbors in fetched.items(): + eid = etype_to_etype_id[etype] + if eid not in node_ids_by_etype: + node_ids_by_etype[eid] = [] + flat_nbrs_by_etype[eid] = [] + counts_by_etype[eid] = [] + node_ids_by_etype[eid].append(node_id) + flat_nbrs_by_etype[eid].extend(neighbors) + counts_by_etype[eid].append(len(neighbors)) + return { + eid: ( + torch.tensor(node_ids_by_etype[eid], dtype=torch.long), + torch.tensor(flat_nbrs_by_etype[eid], dtype=torch.long), + torch.tensor(counts_by_etype[eid], dtype=torch.long), + ) + for eid in node_ids_by_etype + } + + # TODO (mkolodner-sc): Consider introducing a BaseGiGLSampler that owns # shared utilities like _prepare_sample_loop_inputs, with KHopSampler and # PPRSampler as siblings. Currently DistPPRNeighborSampler inherits from @@ -157,6 +193,56 @@ def __init__( NodeType, torch.Tensor ] = self._build_total_degree_tensors(degree_tensors, total_degree_dtype) + # Build integer ID mappings for the C++ forward-push kernel. String + # NodeType / EdgeType keys are only used at the Python boundary + # (translating to/from _batch_fetch_neighbors); all hot-loop state inside + # PPRForwardPushState is indexed by int32 IDs. + # + # We include both source types (have outgoing edges) and destination-only + # types (no outgoing edges, but may accumulate PPR score during the walk) + # so the kernel can index residual/ppr_score tables for any node it sees. + _all_node_types: list[NodeType] = sorted( + {nt for nt in self._node_type_to_edge_types} + | { + self._get_destination_type(et) + for etypes in self._node_type_to_edge_types.values() + for et in etypes + } + ) + # dict.fromkeys preserves insertion order while deduplicating. 
+ _all_edge_types: list[EdgeType] = list( + dict.fromkeys( + et for etypes in self._node_type_to_edge_types.values() for et in etypes + ) + ) + + self._node_type_to_id: dict[NodeType, int] = { + nt: i for i, nt in enumerate(_all_node_types) + } + self._ntype_id_to_ntype: list[NodeType] = _all_node_types + self._etype_to_etype_id: dict[EdgeType, int] = { + et: i for i, et in enumerate(_all_edge_types) + } + self._etype_id_to_etype: list[EdgeType] = _all_edge_types + + self._node_type_id_to_edge_type_ids: list[list[int]] = [ + [ + self._etype_to_etype_id[et] + for et in self._node_type_to_edge_types.get(nt, []) + ] + for nt in _all_node_types + ] + self._edge_type_id_to_dst_ntype_id: list[int] = [ + self._node_type_to_id[self._get_destination_type(et)] + for et in _all_edge_types + ] + # Degree tensors indexed by ntype_id. Destination-only types get an empty + # tensor; the C++ kernel returns 0 for those, matching _get_total_degree. + self._degree_tensors_for_cpp: list[torch.Tensor] = [ + self._node_type_to_total_degree.get(nt, torch.zeros(0, dtype=torch.int32)) + for nt in _all_node_types + ] + def _build_total_degree_tensors( self, degree_tensors: Union[torch.Tensor, dict[EdgeType, torch.Tensor]], @@ -209,35 +295,6 @@ def _build_total_degree_tensors( return result - def _get_total_degree(self, node_id: int, node_type: NodeType) -> int: - """Look up the precomputed total degree of a node. - - Args: - node_id: The ID of the node to look up. - node_type: The node type. - - Returns: - The total degree (sum across all edge types) for the node. - - Raises: - ValueError: If the node ID is out of range, indicating corrupted - graph data or a sampler bug. - """ - # Destination-only node types (no outgoing edges) are absent from - # _node_type_to_total_degree because total degree is only computed for - # traversable source types. Returning 0 here is correct: such nodes - # act as terminals — they accumulate PPR score but never push residual - # further. 
- if node_type not in self._node_type_to_total_degree: - return 0 - degree_tensor = self._node_type_to_total_degree[node_type] - if node_id >= len(degree_tensor): - raise ValueError( - f"Node ID {node_id} exceeds total degree tensor length " - f"({len(degree_tensor)}) for node type {node_type}." - ) - return int(degree_tensor[node_id].item()) - def _get_destination_type(self, edge_type: EdgeType) -> NodeType: """Get the node type at the destination end of an edge type.""" return edge_type[0] if self.edge_dir == "in" else edge_type[-1] @@ -369,226 +426,59 @@ async def _compute_ppr_scores( if seed_node_type is None: seed_node_type = _PPR_HOMOGENEOUS_NODE_TYPE device = seed_nodes.device - batch_size = seed_nodes.size(0) - - # Per-seed PPR state, nested by node type for efficient type-grouped access. - - # ppr_scores[i][node_type][node_id] = accumulated PPR score for node_id - # of type node_type, relative to seed i. Updated each iteration by - # absorbing the node's residual. - ppr_scores: list[dict[NodeType, dict[int, float]]] = [ - defaultdict(lambda: defaultdict(float)) for _ in range(batch_size) - ] - # residuals[i][node_type][node_id] = unconverged probability mass at node_id - # of type node_type for seed i. Each iteration, a node's residual is - # absorbed into its PPR score and then distributed to its neighbors. - residuals: list[dict[NodeType, dict[int, float]]] = [ - defaultdict(lambda: defaultdict(float)) for _ in range(batch_size) - ] - - # queue[i][node_type] = set of node IDs whose residual exceeds the - # convergence threshold (alpha * eps * total_degree). The algorithm - # terminates when all queues are empty. A set is used because multiple - # neighbors can push residual to the same node in one iteration — - # deduplication avoids redundant processing, and the O(1) membership - # check matters since it runs in the innermost loop. 
- queue: list[dict[NodeType, set[int]]] = [ - defaultdict(set) for _ in range(batch_size) - ] - - seed_list = seed_nodes.tolist() - - for i, seed in enumerate(seed_list): - residuals[i][seed_node_type][seed] = self._alpha - queue[i][seed_node_type].add(seed) - - # Cache keyed by (node_id, edge_type) since same node can have different neighbors per edge type - neighbor_cache: dict[tuple[int, EdgeType], list[int]] = {} - - num_nodes_in_queue = batch_size - one_minus_alpha = 1 - self._alpha - - while num_nodes_in_queue > 0: - # Drain all nodes from all queues and group by edge type for batched lookups - queued_nodes: list[dict[NodeType, set[int]]] = [ - defaultdict(set) for _ in range(batch_size) - ] - nodes_to_lookup: dict[EdgeType, set[int]] = defaultdict(set) - - for seed_idx in range(batch_size): - if queue[seed_idx]: - queued_nodes[seed_idx] = queue[seed_idx] - queue[seed_idx] = defaultdict(set) - for node_type, node_ids in queued_nodes[seed_idx].items(): - num_nodes_in_queue -= len(node_ids) - # We fetch neighbors for ALL edge types originating - # from this node type, not just the edge type that - # caused the node to be queued. This is required for - # correctness: forward push distributes residual to - # all neighbors proportionally by total degree, so - # every edge type must be considered. - # Destination-only types have no entry in _node_type_to_edge_types; - # .get() returns [] so we skip neighbor lookup for them. - edge_types_for_node = self._node_type_to_edge_types.get( - node_type, [] - ) - for node_id in node_ids: - for etype in edge_types_for_node: - cache_key = (node_id, etype) - if cache_key not in neighbor_cache: - # TODO (mkolodner-sc): Investigate switching from set to list - # here. _sample_one_hop handles duplicates correctly (second - # write to result[(node_id, etype)] is a no-op overwrite), so - # dedup is not required for correctness. 
A list would avoid - # per-add hash cost and the set->list->tensor conversion in - # _batch_fetch_neighbors, though at the cost of redundant - # network calls for any duplicate nodes across seeds. - nodes_to_lookup[etype].add(node_id) - - fetched_neighbors = await self._batch_fetch_neighbors( - nodes_to_lookup=nodes_to_lookup, - device=device, - ) - # fetched_neighbors is intentionally NOT merged into neighbor_cache - # upfront. We only promote entries when a node is requeued — see - # the should_requeue block below. - - # Push residual to neighbors and re-queue in a single pass. This - # is safe because each seed's state is independent, and residuals - # are always positive so the merged loop can never miss a re-queue. - for seed_idx in range(batch_size): - for source_type, source_nodes in queued_nodes[seed_idx].items(): - for source_node in source_nodes: - source_residual = residuals[seed_idx][source_type].get( - source_node, 0.0 - ) - - ppr_scores[seed_idx][source_type][ - source_node - ] += source_residual - residuals[seed_idx][source_type][source_node] = 0.0 - - # Same destination-only guard as in the queue drain loop above. - edge_types_for_node = self._node_type_to_edge_types.get( - source_type, [] - ) - - total_degree = self._get_total_degree(source_node, source_type) - - if total_degree == 0: - continue + ppr_state = PPRForwardPushState( + seed_nodes, + self._node_type_to_id[seed_node_type], + self._alpha, + self._requeue_threshold_factor, + self._node_type_id_to_edge_type_ids, + self._edge_type_id_to_dst_ntype_id, + self._degree_tensors_for_cpp, + ) - residual_per_neighbor = ( - one_minus_alpha * source_residual / total_degree - ) + while True: + # drain_queue returns None when the queue is truly empty (convergence), + # or a dict (possibly empty) when nodes were drained. An empty dict + # means all drained nodes either had cached neighbors or no outgoing + # edges — we still call push_residuals to flush their residuals into + # ppr_scores_. 
+ drain_result: dict[int, torch.Tensor] | None = ppr_state.drain_queue() + if drain_result is None: + break + + nodes_by_etype_id: dict[int, torch.Tensor] = drain_result + if nodes_by_etype_id: + # Translate integer etype IDs back to EdgeType for the distributed + # fetch layer. O(num_active_etypes) — negligible vs. RPC round-trip. + nodes_to_lookup: dict[EdgeType, set[int]] = { + self._etype_id_to_etype[eid]: set(t.tolist()) + for eid, t in nodes_by_etype_id.items() + } + fetched_neighbors = await self._batch_fetch_neighbors( + nodes_to_lookup, device + ) + fetched_by_etype_id = _group_fetched_by_etype_id( + fetched_neighbors, self._etype_to_etype_id + ) + else: + fetched_by_etype_id = {} - for etype in edge_types_for_node: - cache_key = (source_node, etype) - # fetched_neighbors and neighbor_cache are mutually - # exclusive per iteration: the queue drain only adds - # a node to nodes_to_lookup if it is absent from - # neighbor_cache, so a key appears in at most one. - neighbor_list = fetched_neighbors.get( - cache_key, neighbor_cache.get(cache_key, []) - ) - if not neighbor_list: - continue - - neighbor_type = self._get_destination_type(etype) - - for neighbor_node in neighbor_list: - residuals[seed_idx][neighbor_type][ - neighbor_node - ] += residual_per_neighbor - - requeue_threshold = ( - self._requeue_threshold_factor - * self._get_total_degree( - neighbor_node, neighbor_type - ) - ) - should_requeue = ( - neighbor_node not in queue[seed_idx][neighbor_type] - and residuals[seed_idx][neighbor_type][ - neighbor_node - ] - >= requeue_threshold - ) - if should_requeue: - queue[seed_idx][neighbor_type].add(neighbor_node) - num_nodes_in_queue += 1 - # Promote this node's neighbor lists to the - # persistent cache: it will be processed next - # iteration, so caching now avoids a re-fetch. - # Nodes that are never requeued (typically - # high-degree) are never promoted, keeping - # their large neighbor lists out of the cache. 
- for ( - promote_etype - ) in self._node_type_to_edge_types.get( - neighbor_type, [] - ): - promote_key = (neighbor_node, promote_etype) - if ( - promote_key in fetched_neighbors - and promote_key not in neighbor_cache - ): - neighbor_cache[ - promote_key - ] = fetched_neighbors[promote_key] - - # Extract top-k nodes by PPR score, grouped by node type. - # Results are three flat tensors per node type (no padding): - # - flat_ids: [id_seed0_0, id_seed0_1, ..., id_seed1_0, ...] - # - flat_weights: [wt_seed0_0, wt_seed0_1, ..., wt_seed1_0, ...] - # - valid_counts: [count_seed0, count_seed1, ...] - # - # valid_counts[i] records how many top-k neighbors seed i contributed. - # The inducer uses valid_counts to slice flat_ids into per-seed groups - # and assign local indices. Example: - # - # 4 seeds, valid_counts = [1, 6, 2, 1] (10 total pairs) - # flat_ids = [d0a, d1a, d1b, d1c, d1d, d1e, d1f, d2a, d2b, d3a] - # - # seed 0 owns flat_ids[0:1], seed 1 owns flat_ids[1:7], - # seed 2 owns flat_ids[7:9], seed 3 owns flat_ids[9:10] - # _node_type_to_edge_types only contains source types; destination-only - # types are absent but may have accumulated PPR scores during the walk. - # We union with all types seen in ppr_scores so they appear in the output. - all_node_types: set[NodeType] = set(self._node_type_to_edge_types.keys()) - for seed_ppr in ppr_scores: - all_node_types.update(seed_ppr.keys()) + ppr_state.push_residuals(fetched_by_etype_id) + # Translate ntype_id integer keys back to NodeType strings for the rest + # of the pipeline, and move tensors to the correct device. 
ntype_to_flat_ids: dict[NodeType, torch.Tensor] = {} ntype_to_flat_weights: dict[NodeType, torch.Tensor] = {} ntype_to_valid_counts: dict[NodeType, torch.Tensor] = {} - for ntype in all_node_types: - flat_ids: list[int] = [] - flat_weights: list[float] = [] - valid_counts: list[int] = [] - - for i in range(batch_size): - type_scores = ppr_scores[i].get(ntype, {}) - top_k = heapq.nlargest( - self._max_ppr_nodes, type_scores.items(), key=lambda x: x[1] - ) - if top_k: - ids, weights = zip(*top_k) - flat_ids.extend(ids) - flat_weights.extend(weights) - valid_counts.append(len(top_k)) - - ntype_to_flat_ids[ntype] = torch.tensor( - flat_ids, dtype=torch.long, device=device - ) - ntype_to_flat_weights[ntype] = torch.tensor( - flat_weights, dtype=torch.float, device=device - ) - ntype_to_valid_counts[ntype] = torch.tensor( - valid_counts, dtype=torch.long, device=device - ) + for ntype_id, (flat_ids, flat_weights, valid_counts) in ppr_state.extract_top_k( + self._max_ppr_nodes + ).items(): + ntype = self._ntype_id_to_ntype[ntype_id] + ntype_to_flat_ids[ntype] = flat_ids.to(device) + ntype_to_flat_weights[ntype] = flat_weights.to(device) + ntype_to_valid_counts[ntype] = valid_counts.to(device) if self._is_homogeneous: assert ( From a23179686c7025a9e8428267743a036f0b532e63 Mon Sep 17 00:00:00 2001 From: mkolodner Date: Tue, 24 Mar 2026 21:07:02 +0000 Subject: [PATCH 02/14] small precision fix --- .../cpp_extensions/ppr_forward_push.cpp | 35 ++++++++++--------- .../unit/distributed/dist_ppr_sampler_test.py | 13 ++++--- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp index 6f6d10545..e50373dcd 100644 --- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp +++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp @@ -33,14 +33,14 @@ class PPRForwardPushState { PPRForwardPushState( torch::Tensor seed_nodes, int32_t seed_node_type_id, - float alpha, - 
float requeue_threshold_factor, + double alpha, + double requeue_threshold_factor, std::vector> node_type_to_edge_type_ids, std::vector edge_type_to_dst_ntype_id, std::vector degree_tensors ) : alpha_(alpha), - one_minus_alpha_(1.0f - alpha), + one_minus_alpha_(1.0 - alpha), requeue_threshold_factor_(requeue_threshold_factor), node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)), edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)), @@ -50,8 +50,8 @@ class PPRForwardPushState { batch_size_ = static_cast(seed_nodes.size(0)); num_node_types_ = static_cast(node_type_to_edge_type_ids_.size()); - ppr_scores_.assign(batch_size_, std::vector>(num_node_types_)); - residuals_.assign(batch_size_, std::vector>(num_node_types_)); + ppr_scores_.assign(batch_size_, std::vector>(num_node_types_)); + residuals_.assign(batch_size_, std::vector>(num_node_types_)); queue_.assign(batch_size_, std::vector>(num_node_types_)); queued_nodes_.assign(batch_size_, std::vector>(num_node_types_)); @@ -148,15 +148,15 @@ class PPRForwardPushState { for (int32_t src : queued_nodes_[s][nt]) { auto& src_res = residuals_[s][nt]; auto it = src_res.find(src); - float res = (it != src_res.end()) ? it->second : 0.0f; + double res = (it != src_res.end()) ? 
it->second : 0.0; ppr_scores_[s][nt][src] += res; - src_res[src] = 0.0f; + src_res[src] = 0.0; int32_t total_deg = get_total_degree(src, nt); if (total_deg == 0) continue; - float res_per_nbr = one_minus_alpha_ * res / static_cast(total_deg); + double res_per_nbr = one_minus_alpha_ * res / static_cast(total_deg); for (int32_t eid : node_type_to_edge_type_ids_[nt]) { // fetched and neighbor_cache are mutually exclusive per iteration: @@ -177,8 +177,8 @@ class PPRForwardPushState { for (int32_t nbr : *nbr_list) { residuals_[s][dst_nt][nbr] += res_per_nbr; - float threshold = requeue_threshold_factor_ * - static_cast(get_total_degree(nbr, dst_nt)); + double threshold = requeue_threshold_factor_ * + static_cast(get_total_degree(nbr, dst_nt)); if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() && residuals_[s][dst_nt][nbr] >= threshold) { @@ -225,12 +225,12 @@ class PPRForwardPushState { const auto& scores = ppr_scores_[s][nt]; int32_t k = std::min(max_ppr_nodes, static_cast(scores.size())); if (k > 0) { - std::vector> items(scores.begin(), scores.end()); + std::vector> items(scores.begin(), scores.end()); std::partial_sort(items.begin(), items.begin() + k, items.end(), [](const auto& a, const auto& b) { return a.second > b.second; }); for (int32_t i = 0; i < k; ++i) { flat_ids.push_back(static_cast(items[i].first)); - flat_weights.push_back(items[i].second); + flat_weights.push_back(static_cast(items[i].second)); } } valid_counts.push_back(static_cast(k)); @@ -258,7 +258,7 @@ class PPRForwardPushState { return t.data_ptr()[node_id]; } - float alpha_, one_minus_alpha_, requeue_threshold_factor_; + double alpha_, one_minus_alpha_, requeue_threshold_factor_; int32_t batch_size_, num_node_types_, num_nodes_in_queue_{0}; std::vector> node_type_to_edge_type_ids_; @@ -266,8 +266,11 @@ class PPRForwardPushState { std::vector degree_tensors_; // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]). 
- std::vector>> ppr_scores_; - std::vector>> residuals_; + // double precision avoids float32 rounding errors accumulating over 20-30 + // push iterations, which would otherwise cause ~1e-4 score errors vs the + // true PPR. Output weights are cast to float32 in extract_top_k. + std::vector>> ppr_scores_; + std::vector>> residuals_; std::vector>> queue_; // Snapshot of queue contents from the last drain_queue() call, used by push_residuals(). std::vector>> queued_nodes_; @@ -283,7 +286,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { .def(py::init< torch::Tensor, int32_t, - float, float, + double, double, std::vector>, std::vector, std::vector diff --git a/tests/unit/distributed/dist_ppr_sampler_test.py b/tests/unit/distributed/dist_ppr_sampler_test.py index 5ad96e66e..a4f6d160b 100644 --- a/tests/unit/distributed/dist_ppr_sampler_test.py +++ b/tests/unit/distributed/dist_ppr_sampler_test.py @@ -270,8 +270,9 @@ def _assert_ppr_scores_match_reference( """Assert sampler PPR scores match reference scores per node type. Checks that top-k node sets are identical and that per-node scores - are within atol=1e-6. The forward push error per node is bounded by - O(alpha * eps * degree); observed deltas are ~1e-7 for eps=1e-6. + are within atol=1e-5. The forward push error per node is bounded by + O(alpha * eps * degree); for max degree 3, alpha=0.5, eps=1e-6 the + theoretical bound is ~1.5e-6, so 1e-5 provides a safety margin. Args: ntype_to_sampler_ppr: Sampler output from :func:`_extract_hetero_ppr_scores`. 
@@ -290,7 +291,7 @@ def _assert_ppr_scores_match_reference( for node_id in reference_ppr[ntype_str]: ref_score = reference_ppr[ntype_str][node_id] sam_score = ntype_to_sampler_ppr[ntype_str][node_id] - assert abs(sam_score - ref_score) < 1e-6, ( + assert abs(sam_score - ref_score) < 1e-5, ( f"{seed_id}, type {ntype_str}, node {node_id}: " f"sampler={sam_score:.8f} vs reference={ref_score:.8f}" ) @@ -372,11 +373,13 @@ def _run_ppr_loader_correctness_check( ) # Forward push is an approximation; with eps=1e-6 the per-node error - # is bounded by O(alpha * eps * degree). Observed deltas are ~1e-7. + # is bounded by O(alpha * eps * degree). For this test graph + # (max degree 3, alpha=0.5, eps=1e-6) the theoretical bound is ~1.5e-6. + # Tolerance is set to 1e-5 to provide a safety margin above that bound. for node_id in reference_ppr: ref_score = reference_ppr[node_id] sam_score = sampler_ppr[node_id] - assert abs(sam_score - ref_score) < 1e-6, ( + assert abs(sam_score - ref_score) < 1e-5, ( f"Seed {seed_global_id}, node {node_id}: " f"sampler={sam_score:.8f} vs reference={ref_score:.8f}" ) From a19db887809278de0de5427be033798031ddb41b Mon Sep 17 00:00:00 2001 From: mkolodner Date: Tue, 24 Mar 2026 21:36:32 +0000 Subject: [PATCH 03/14] Optimize --- gigl/distributed/dist_ppr_sampler.py | 118 ++++++--------------------- 1 file changed, 24 insertions(+), 94 deletions(-) diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py index cf4c732c0..b63329357 100644 --- a/gigl/distributed/dist_ppr_sampler.py +++ b/gigl/distributed/dist_ppr_sampler.py @@ -41,49 +41,6 @@ ) -def _group_fetched_by_etype_id( - fetched: dict[tuple[int, EdgeType], list[int]], - etype_to_etype_id: dict[EdgeType, int], -) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: - """Group batch-fetched neighbors by integer edge type ID for the C++ push kernel. - - Performs one linear pass over the fetched dict, building flat tensors per - edge type. 
This avoids per-neighbor Python overhead in push_residuals by - batching all lookups for the same edge type together. - - Args: - fetched: Output of _batch_fetch_neighbors: ``(node_id, etype)`` → - neighbor list. - etype_to_etype_id: Mapping from EdgeType to its integer ID. - - Returns: - Dict mapping etype_id to ``(node_ids, flat_neighbors, counts)`` as - flat int64 tensors. ``flat_neighbors`` is the concatenation of all - neighbor lists for that edge type; ``counts[i]`` gives the neighbor - count for ``node_ids[i]``. - """ - node_ids_by_etype: dict[int, list[int]] = {} - flat_nbrs_by_etype: dict[int, list[int]] = {} - counts_by_etype: dict[int, list[int]] = {} - for (node_id, etype), neighbors in fetched.items(): - eid = etype_to_etype_id[etype] - if eid not in node_ids_by_etype: - node_ids_by_etype[eid] = [] - flat_nbrs_by_etype[eid] = [] - counts_by_etype[eid] = [] - node_ids_by_etype[eid].append(node_id) - flat_nbrs_by_etype[eid].extend(neighbors) - counts_by_etype[eid].append(len(neighbors)) - return { - eid: ( - torch.tensor(node_ids_by_etype[eid], dtype=torch.long), - torch.tensor(flat_nbrs_by_etype[eid], dtype=torch.long), - torch.tensor(counts_by_etype[eid], dtype=torch.long), - ) - for eid in node_ids_by_etype - } - - # TODO (mkolodner-sc): Consider introducing a BaseGiGLSampler that owns # shared utilities like _prepare_sample_loop_inputs, with KHopSampler and # PPRSampler as siblings. Currently DistPPRNeighborSampler inherits from @@ -195,7 +152,7 @@ def __init__( # Build integer ID mappings for the C++ forward-push kernel. String # NodeType / EdgeType keys are only used at the Python boundary - # (translating to/from _batch_fetch_neighbors); all hot-loop state inside + # (translating to/from _sample_one_hop); all hot-loop state inside # PPRForwardPushState is indexed by int32 IDs. 
# # We include both source types (have outgoing edges) and destination-only @@ -301,68 +258,50 @@ def _get_destination_type(self, edge_type: EdgeType) -> NodeType: async def _batch_fetch_neighbors( self, - nodes_to_lookup: dict[EdgeType, set[int]], + nodes_by_etype_id: dict[int, torch.Tensor], device: torch.device, - ) -> dict[tuple[int, EdgeType], list[int]]: - """Batch fetch neighbors for nodes grouped by edge type. + ) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + """Batch fetch neighbors for nodes grouped by integer edge type ID. Issues one ``_sample_one_hop`` call per edge type (not per node), so all nodes of the same edge type are fetched in a single RPC round-trip. Each node's neighbor list is capped at ``self._num_neighbors_per_hop``. Args: - nodes_to_lookup: Dict mapping each edge type to the set of node IDs - whose neighbors should be fetched via that edge type. Only nodes - absent from the caller's ``neighbor_cache`` should be included. + nodes_by_etype_id: Dict mapping integer edge type ID to a 1-D int64 + tensor of node IDs to fetch neighbors for. Comes directly from + ``drain_queue()``; node IDs are already deduplicated. device: Torch device for intermediate tensor creation. Returns: - Dict mapping ``(node_id, edge_type)`` to the list of neighbor node IDs - returned by ``_sample_one_hop``. Only nodes that appeared in - ``nodes_to_lookup`` are present; edge types with an empty node set are - skipped entirely. + Dict mapping etype_id to ``(node_ids, flat_neighbors, counts)`` as + int64 tensors, ready to pass directly to ``push_residuals``. + ``flat_neighbors`` is the flat concatenation of all neighbor lists + for that edge type; ``counts[i]`` is the neighbor count for + ``node_ids[i]``. 
Example:: - nodes_to_lookup = { - ("user", "buys", "item"): {0, 3}, - ("item", "bought_by", "user"): {7}, + nodes_by_etype_id = { + 2: tensor([0, 3]), # etype_id 2 → nodes 0 and 3 + 5: tensor([7]), # etype_id 5 → node 7 } # Might return (neighbor lists depend on graph structure): { - (0, ("user", "buys", "item")): [5, 9, 2], - (3, ("user", "buys", "item")): [1], - (7, ("item", "bought_by", "user")): [0, 3], + 2: (tensor([0, 3]), tensor([5, 9, 2, 1]), tensor([3, 1])), + 5: (tensor([7]), tensor([0, 3]), tensor([2])), } """ - result: dict[tuple[int, EdgeType], list[int]] = {} - for etype, node_ids in nodes_to_lookup.items(): - if not node_ids: - continue - nodes_list = list(node_ids) - lookup_tensor = torch.tensor(nodes_list, dtype=torch.long, device=device) - + result: dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = {} + for eid, node_ids_tensor in nodes_by_etype_id.items(): + etype = self._etype_id_to_etype[eid] # _sample_one_hop expects None for homogeneous graphs, not the PPR sentinel. output: NeighborOutput = await self._sample_one_hop( - srcs=lookup_tensor, + srcs=node_ids_tensor.to(device), num_nbr=self._num_neighbors_per_hop, etype=etype if etype != _PPR_HOMOGENEOUS_EDGE_TYPE else None, ) - neighbors = output.nbr - neighbor_counts = output.nbr_num - - # TODO (mkolodner-sc): Investigate performance of a vectorized version of the below code - neighbors_list = neighbors.tolist() - counts_list = neighbor_counts.tolist() - del neighbors, neighbor_counts - - # neighbors_list is a flat concatenation of all neighbors for all looked-up nodes. - # We use offset to slice out each node's neighbors: node i's neighbors are at - # neighbors_list[offset : offset + count], then we advance offset by count. 
- offset = 0 - for node_id, count in zip(nodes_list, counts_list): - result[(node_id, etype)] = neighbors_list[offset : offset + count] - offset += count + result[eid] = (node_ids_tensor, output.nbr, output.nbr_num) return result @@ -449,17 +388,8 @@ async def _compute_ppr_scores( nodes_by_etype_id: dict[int, torch.Tensor] = drain_result if nodes_by_etype_id: - # Translate integer etype IDs back to EdgeType for the distributed - # fetch layer. O(num_active_etypes) — negligible vs. RPC round-trip. - nodes_to_lookup: dict[EdgeType, set[int]] = { - self._etype_id_to_etype[eid]: set(t.tolist()) - for eid, t in nodes_by_etype_id.items() - } - fetched_neighbors = await self._batch_fetch_neighbors( - nodes_to_lookup, device - ) - fetched_by_etype_id = _group_fetched_by_etype_id( - fetched_neighbors, self._etype_to_etype_id + fetched_by_etype_id = await self._batch_fetch_neighbors( + nodes_by_etype_id, device ) else: fetched_by_etype_id = {} From fed381545fc7923d7377b69513480e615057a588 Mon Sep 17 00:00:00 2001 From: mkolodner Date: Tue, 24 Mar 2026 21:42:45 +0000 Subject: [PATCH 04/14] Add explanatory comments to ppr_forward_push.cpp for C++ newcomers --- .../cpp_extensions/ppr_forward_push.cpp | 244 +++++++++++++++--- 1 file changed, 203 insertions(+), 41 deletions(-) diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp index e50373dcd..e22ac264f 100644 --- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp +++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp @@ -1,16 +1,30 @@ -#include -#include +#include // PyTorch C++ API (tensors, TORCH_CHECK) +#include // Automatic conversion between C++ containers and Python types -#include -#include -#include -#include -#include +#include // std::partial_sort, std::min +#include // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t +#include // std::unordered_map — like Python dict, O(1) average lookup +#include // std::unordered_set — like 
Python set, O(1) average lookup +#include // std::vector — like Python list, contiguous in memory -namespace py = pybind11; +namespace py = pybind11; // Alias for the pybind11 namespace (bridges C++ ↔ Python) -// Pack (node_id, etype_id) into a single uint64_t lookup key. -// Requires both values fit in 32 bits — enforced by the Python caller. +// Combine (node_id, etype_id) into a single 64-bit integer for use as a hash +// map key. A single 64-bit integer is cheaper to hash than a pair of two +// integers (std::unordered_map has no built-in pair hash). +// +// Bit layout: +// bits 63–32: node_id (upper half) +// bits 31– 0: etype_id (lower half) +// +// Both inputs are cast through uint32_t before packing. Without this, a +// negative int32_t (e.g. -1 = 0xFFFFFFFF) would be sign-extended to a full +// 64-bit value, corrupting the upper bits when shifted. Reinterpreting as +// uint32_t first treats the bit pattern as-is (no sign extension). +// +// `static inline` means: define this function here in the translation unit +// (not in a separate object file) and ask the compiler to inline it at each +// call site instead of generating a function call. static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { return (static_cast(static_cast(node_id)) << 32) | static_cast(etype_id); @@ -18,6 +32,10 @@ static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { // C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006). // +// All hot-loop state (scores, residuals, queue, neighbor cache) lives inside +// this object. The distributed neighbor fetch is kept in Python because it +// involves async RPC calls that C++ cannot drive directly. +// // Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache. // Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors). 
// @@ -42,6 +60,9 @@ class PPRForwardPushState { : alpha_(alpha), one_minus_alpha_(1.0 - alpha), requeue_threshold_factor_(requeue_threshold_factor), + // std::move transfers ownership of each vector into the member variable + // without copying its contents — equivalent to Python's list hand-off + // when you no longer need the original. node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)), edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)), degree_tensors_(std::move(degree_tensors)) { @@ -50,15 +71,26 @@ class PPRForwardPushState { batch_size_ = static_cast(seed_nodes.size(0)); num_node_types_ = static_cast(node_type_to_edge_type_ids_.size()); - ppr_scores_.assign(batch_size_, std::vector>(num_node_types_)); - residuals_.assign(batch_size_, std::vector>(num_node_types_)); - queue_.assign(batch_size_, std::vector>(num_node_types_)); - queued_nodes_.assign(batch_size_, std::vector>(num_node_types_)); - + // Allocate per-seed, per-node-type tables. + // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python. + // Each inner element is an empty hash map / hash set for that (seed, ntype) pair. + ppr_scores_.assign(batch_size_, std::vector>(num_node_types_)); + residuals_.assign(batch_size_, std::vector>(num_node_types_)); + queue_.assign(batch_size_, std::vector>(num_node_types_)); + queued_nodes_.assign(batch_size_, std::vector>(num_node_types_)); + + // accessor() returns a typed view into the tensor's data that + // supports [i] indexing with bounds checking in debug builds. Here we read + // each seed node ID from the 1-D int64 tensor. auto acc = seed_nodes.accessor(); num_nodes_in_queue_ = batch_size_; for (int32_t i = 0; i < batch_size_; ++i) { + // static_cast: explicit narrowing from int64 to int32. + // The Python caller guarantees node IDs fit in 32 bits. int32_t seed = static_cast(acc[i]); + // PPR initialisation: each seed starts with residual = alpha (the + // restart probability). 
The first push will move alpha into ppr_score + // and distribute (1-alpha)*alpha to the seed's neighbors. residuals_[i][seed_node_type_id][seed] = alpha_; queue_[i][seed_node_type_id].insert(seed); } @@ -68,32 +100,45 @@ class PPRForwardPushState { // neighbor lookup. Also snapshots the drained nodes into queued_nodes_ for // use by push_residuals(). // - // Returns None when the queue is truly empty (convergence signal). - // Returns a dict (possibly empty) when nodes were drained but all had cached - // neighbors or no outgoing edges — push_residuals must still be called to - // flush their residuals into ppr_scores_. + // Return value semantics (py::object can hold any Python value): + // - py::none() → queue was already empty; convergence achieved; stop the loop. + // - py::dict{} → nodes were drained. The dict maps etype_id → 1-D int64 + // tensor of node IDs that need neighbor lookups this round. + // May be empty if all drained nodes were already in the cache + // or had no outgoing edges — push_residuals must still be called + // to flush their accumulated residual into ppr_scores_. py::object drain_queue() { if (num_nodes_in_queue_ == 0) { return py::none(); } + // Reset the snapshot from the previous iteration. `auto&` is a reference + // (alias) to the existing set — clearing it modifies the original in-place + // rather than operating on a copy. for (int32_t s = 0; s < batch_size_; ++s) for (auto& qs : queued_nodes_[s]) qs.clear(); + // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for + // edge type eid this round. Using a set deduplicates nodes that appear + // in multiple seeds' queues: we only fetch each (node, etype) pair once + // regardless of how many seeds need it. std::unordered_map> nodes_to_lookup; for (int32_t s = 0; s < batch_size_; ++s) { for (int32_t nt = 0; nt < num_node_types_; ++nt) { if (queue_[s][nt].empty()) continue; - // Snapshot queue into queued_nodes, then reset queue. 
+ // Move the live queue into the snapshot (no data copy — O(1)). + // queue_ is then reset to an empty set so new entries added by + // push_residuals() in this same iteration don't interfere. queued_nodes_[s][nt] = std::move(queue_[s][nt]); queue_[s][nt].clear(); num_nodes_in_queue_ -= static_cast(queued_nodes_[s][nt].size()); for (int32_t node_id : queued_nodes_[s][nt]) { for (int32_t eid : node_type_to_edge_type_ids_[nt]) { - // Only add to lookup if not already in the persistent cache. + // Only request a fetch if the neighbor list isn't already + // cached from a previous iteration. if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) { nodes_to_lookup[eid].insert(node_id); } @@ -102,8 +147,13 @@ class PPRForwardPushState { } } + // Convert to Python: {etype_id (int) → 1-D int64 tensor of node IDs}. + // py::int_(eid) wraps a C++ int as a Python int so it can be used as a + // dict key on the Python side. py::dict result; for (auto& [eid, node_set] : nodes_to_lookup) { + // Copy the set into a vector first: torch::tensor() requires a + // contiguous sequence, not an unordered_set iterator. std::vector ids(node_set.begin(), node_set.end()); result[py::int_(eid)] = torch::tensor(ids, torch::kLong); } @@ -111,24 +161,38 @@ class PPRForwardPushState { } // Push residuals to neighbors given the fetched neighbor data. 
+ // // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)} - // - node_ids_tensor: [N] int64 — source node IDs fetched for this edge type - // - flat_nbrs_tensor: [sum(counts)] int64 — flat concatenation of all neighbor lists - // - counts_tensor: [N] int64 — number of neighbors for each source node + // - node_ids_tensor: [N] int64 — source node IDs fetched for this edge type + // - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat + // - counts_tensor: [N] int64 — neighbor count for each source node + // + // For example, if nodes 3 and 7 were fetched for etype 0: + // node_ids = [3, 7] + // flat_nbrs = [10, 11, 12, 20] ← node 3 has nbrs {10,11,12}, node 7 has nbr {20} + // counts = [3, 1] void push_residuals(py::dict fetched_by_etype_id) { - // Build local fetched map: pack_key(node_id, etype_id) -> neighbor list. + // Step 1: Unpack the Python dict into a C++ map for fast lookup during + // the residual-push loop below. + // fetched: pack_key(node_id, etype_id) → neighbor list (as int32_t vector) std::unordered_map> fetched; for (auto item : fetched_by_etype_id) { - int32_t eid = item.first.cast(); + int32_t eid = item.first.cast(); + // .cast() interprets the Python value as a tuple so we + // can index into it with [0], [1], [2]. auto tup = item.second.cast(); auto node_ids_t = tup[0].cast(); auto flat_nbrs_t = tup[1].cast(); auto counts_t = tup[2].cast(); - auto node_acc = node_ids_t.accessor(); - auto nbr_acc = flat_nbrs_t.accessor(); - auto cnt_acc = counts_t.accessor(); + // accessor() gives a bounds-checked, typed 1-D view into + // each tensor's data — equivalent to iterating over a NumPy array. + auto node_acc = node_ids_t.accessor(); + auto nbr_acc = flat_nbrs_t.accessor(); + auto cnt_acc = counts_t.accessor(); + // Walk the flat neighbor list, slicing out each node's neighbors using + // the running offset into the concatenated flat buffer. 
int64_t offset = 0; for (int64_t i = 0; i < node_ids_t.size(0); ++i) { int32_t nid = static_cast(node_acc[i]); @@ -136,54 +200,85 @@ class PPRForwardPushState { std::vector nbrs(count); for (int64_t j = 0; j < count; ++j) nbrs[j] = static_cast(nbr_acc[offset + j]); + // std::move: hand off nbrs to the map without copying its contents. fetched[pack_key(nid, eid)] = std::move(nbrs); offset += count; } } + // Step 2: For every node that was in the queue (captured in queued_nodes_ + // by drain_queue()), apply one PPR push step: + // a. Absorb residual into the PPR score. + // b. Distribute (1-alpha) * residual equally to each neighbor. + // c. Enqueue any neighbor whose residual now exceeds the requeue threshold. for (int32_t s = 0; s < batch_size_; ++s) { for (int32_t nt = 0; nt < num_node_types_; ++nt) { if (queued_nodes_[s][nt].empty()) continue; for (int32_t src : queued_nodes_[s][nt]) { + // `auto&` gives a reference to the residual map for this + // (seed, node_type) pair so we can read and write it without + // an extra hash lookup each time. auto& src_res = residuals_[s][nt]; + // .find() returns an iterator; .end() means "not found". + // We treat a missing entry as residual = 0. auto it = src_res.find(src); double res = (it != src_res.end()) ? it->second : 0.0; + // a. Absorb: move residual into the PPR score. ppr_scores_[s][nt][src] += res; src_res[src] = 0.0; int32_t total_deg = get_total_degree(src, nt); + // Destination-only nodes (no outgoing edges) absorb residual + // into their PPR score but do not push further. if (total_deg == 0) continue; + // b. Distribute: each neighbor of src (across all edge types + // from nt) receives an equal share of the pushed residual. 
double res_per_nbr = one_minus_alpha_ * res / static_cast(total_deg); for (int32_t eid : node_type_to_edge_type_ids_[nt]) { - // fetched and neighbor_cache are mutually exclusive per iteration: - // drain_queue only adds a node to nodes_to_lookup when absent from - // neighbor_cache, so a given key appears in at most one of the two. + // Invariant: fetched and neighbor_cache_ are mutually exclusive for + // any given (node, etype) key within one iteration. drain_queue() + // only requests a fetch for nodes absent from neighbor_cache_, so a + // key is in at most one of the two. We check fetched first since it + // is the common case for newly-seen nodes. + // + // `const std::vector*` is a pointer to a neighbor list. + // We use a pointer (rather than copying the list) so we can check + // for absence with nullptr without allocating anything. const std::vector* nbr_list = nullptr; auto fi = fetched.find(pack_key(src, eid)); if (fi != fetched.end()) { + // `&fi->second` takes the address of the vector stored in + // the map — nbr_list now points to it without copying. nbr_list = &fi->second; } else { auto ci = neighbor_cache_.find(pack_key(src, eid)); if (ci != neighbor_cache_.end()) nbr_list = &ci->second; } + // Skip if no neighbor list is available (node has no edges of + // this type, or the fetch returned an empty list). if (!nbr_list || nbr_list->empty()) continue; int32_t dst_nt = edge_type_to_dst_ntype_id_[eid]; + // c. For each neighbor, accumulate residual and check threshold. + // `*nbr_list` dereferences the pointer to access the vector. for (int32_t nbr : *nbr_list) { residuals_[s][dst_nt][nbr] += res_per_nbr; double threshold = requeue_threshold_factor_ * static_cast(get_total_degree(nbr, dst_nt)); + // Only enqueue if: (1) not already in queue for this + // iteration, and (2) residual exceeds the push threshold + // alpha * eps * degree. 
if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() && residuals_[s][dst_nt][nbr] >= threshold) { queue_[s][dst_nt].insert(nbr); - ++num_nodes_in_queue_; + ++num_nodes_in_queue_; // ++x is equivalent to x += 1 // Promote this node's neighbor lists to the persistent cache: // it will be processed next iteration, so caching now avoids @@ -207,9 +302,16 @@ class PPRForwardPushState { } // Extract top-k PPR nodes per seed per node type. + // // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}. // Only node types that received any PPR score are included in the output. + // + // Output layout for a batch of B seeds (same structure as _batch_fetch_neighbors): + // flat_ids[0 : valid_counts[0]] → top-k nodes for seed 0 + // flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1 + // ... py::dict extract_top_k(int32_t max_ppr_nodes) { + // Collect node types that have any PPR score — skip types with no activity. std::unordered_set active; for (int32_t s = 0; s < batch_size_; ++s) for (int32_t nt = 0; nt < num_node_types_; ++nt) @@ -217,25 +319,42 @@ class PPRForwardPushState { py::dict result; for (int32_t nt : active) { + // Flat output vectors — entries for all seeds are concatenated. std::vector flat_ids; std::vector flat_weights; std::vector valid_counts; for (int32_t s = 0; s < batch_size_; ++s) { + // `const auto&` is a read-only reference — we iterate the map + // without copying it. const auto& scores = ppr_scores_[s][nt]; + // Cap k at the number of nodes that actually have a score. int32_t k = std::min(max_ppr_nodes, static_cast(scores.size())); if (k > 0) { + // Copy the map entries into a vector of (node_id, score) pairs + // so they can be sorted. std::pair is like a Python 2-tuple. std::vector> items(scores.begin(), scores.end()); + + // std::partial_sort rearranges items so that the first k entries + // are the k largest — like Python's heapq.nlargest but in-place. 
+ // The lambda `[](const auto& a, const auto& b) { return ...; }` + // is an anonymous comparator (like Python's `key=` argument). + // `.second` accesses the score (second element of the pair); + // `>` makes it descending (highest score first). std::partial_sort(items.begin(), items.begin() + k, items.end(), [](const auto& a, const auto& b) { return a.second > b.second; }); + for (int32_t i = 0; i < k; ++i) { flat_ids.push_back(static_cast(items[i].first)); + // Cast to float32 for output; internal scores stay double to + // avoid accumulated rounding errors in the push loop above. flat_weights.push_back(static_cast(items[i].second)); } } valid_counts.push_back(static_cast(k)); } + // py::make_tuple wraps C++ values into a Python tuple. result[py::int_(nt)] = py::make_tuple( torch::tensor(flat_ids, torch::kLong), torch::tensor(flat_weights, torch::kFloat), @@ -246,6 +365,8 @@ class PPRForwardPushState { } private: + // Look up the total (across all edge types) out-degree of a node. + // Returns 0 for destination-only node types (no outgoing edges). int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const { if (ntype_id >= static_cast(degree_tensors_.size())) return 0; const auto& t = degree_tensors_[ntype_id]; @@ -255,34 +376,75 @@ class PPRForwardPushState { "Node ID ", node_id, " out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0), "). This indicates corrupted graph data or a sampler bug." ); + // data_ptr() returns a raw C pointer to the tensor's int32 data + // buffer. Direct pointer indexing ([node_id]) is safe here because we + // validated the bounds with TORCH_CHECK above. 
return t.data_ptr()[node_id]; } - double alpha_, one_minus_alpha_, requeue_threshold_factor_; - int32_t batch_size_, num_node_types_, num_nodes_in_queue_{0}; - + // ------------------------------------------------------------------------- + // Scalar algorithm parameters + // ------------------------------------------------------------------------- + double alpha_; // Restart probability + double one_minus_alpha_; // 1 - alpha, precomputed to avoid repeated subtraction + double requeue_threshold_factor_; // alpha * eps; multiplied by degree to get per-node threshold + + int32_t batch_size_; // Number of seeds in the current batch + int32_t num_node_types_; // Total number of node types (homo + hetero) + int32_t num_nodes_in_queue_{0}; // Running count of nodes across all seeds / types + + // ------------------------------------------------------------------------- + // Graph structure (read-only after construction) + // ------------------------------------------------------------------------- + // node_type_to_edge_type_ids_[ntype_id] → list of edge type IDs that can be + // traversed from that node type (outgoing or incoming, depending on edge_dir). std::vector> node_type_to_edge_type_ids_; + // edge_type_to_dst_ntype_id_[eid] → node type ID at the destination end. std::vector edge_type_to_dst_ntype_id_; + // degree_tensors_[ntype_id][node_id] → total degree of that node across all + // edge types traversable from its type. Empty tensor means no outgoing edges. std::vector degree_tensors_; - // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]). + // ------------------------------------------------------------------------- + // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]) + // ------------------------------------------------------------------------- // double precision avoids float32 rounding errors accumulating over 20-30 // push iterations, which would otherwise cause ~1e-4 score errors vs the // true PPR. 
Output weights are cast to float32 in extract_top_k. + // + // ppr_scores_[s][nt]: node_id → absorbed PPR score (Σ of residuals pushed so far) std::vector>> ppr_scores_; + // residuals_[s][nt]: node_id → unabsorbed probability mass waiting to be pushed std::vector>> residuals_; + // queue_[s][nt]: nodes whose residual exceeds the threshold and need a push next round std::vector>> queue_; - // Snapshot of queue contents from the last drain_queue() call, used by push_residuals(). + // queued_nodes_[s][nt]: snapshot of queue_ taken by drain_queue() for the current round. + // Separating it from queue_ lets push_residuals() enqueue new nodes into queue_ without + // modifying the set currently being iterated. std::vector>> queued_nodes_; - // Persistent neighbor cache: pack_key(node_id, etype_id) -> neighbor list. - // Only nodes that have been requeued (and thus will be processed again) are - // promoted here from the per-iteration fetched map. + // ------------------------------------------------------------------------- + // Neighbor cache + // ------------------------------------------------------------------------- + // Persistent cache: pack_key(node_id, etype_id) → neighbor list. + // Only nodes that have been re-queued (and will therefore be processed again) + // are promoted here from the per-iteration fetched map in push_residuals(). + // This avoids re-fetching neighbors for nodes processed in multiple iterations + // while keeping large neighbor lists of high-degree (never-requeued) nodes + // out of memory. std::unordered_map> neighbor_cache_; }; +// Register PPRForwardPushState with Python via pybind11. +// +// TORCH_EXTENSION_NAME is set by PyTorch's setup() at build time to match the +// Python module name (e.g. "ppr_forward_push"). At import time, Python calls +// this function to populate the module with the C++ class. PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { py::class_(m, "PPRForwardPushState") + // .def(py::init<...>()) exposes the constructor. 
The template arguments + // list the exact C++ parameter types so pybind11 can convert Python + // arguments to the correct C++ types automatically. .def(py::init< torch::Tensor, int32_t, From 906df014092f388eba9274121e6d838c81b8abdd Mon Sep 17 00:00:00 2001 From: mkolodner Date: Wed, 25 Mar 2026 18:48:27 +0000 Subject: [PATCH 05/14] Apply clang-format to ppr_forward_push.cpp --- .../cpp_extensions/ppr_forward_push.cpp | 127 +++++++++--------- 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp index e22ac264f..0af3eb2b5 100644 --- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp +++ b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp @@ -1,8 +1,8 @@ +#include // Automatic conversion between C++ containers and Python types #include // PyTorch C++ API (tensors, TORCH_CHECK) -#include // Automatic conversion between C++ containers and Python types -#include // std::partial_sort, std::min -#include // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t +#include // std::partial_sort, std::min +#include // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t #include // std::unordered_map — like Python dict, O(1) average lookup #include // std::unordered_set — like Python set, O(1) average lookup #include // std::vector — like Python list, contiguous in memory @@ -47,16 +47,12 @@ static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { // 4. push_residuals(fetched_by_etype_id) — push residuals, update queue // 5. 
extract_top_k(max_ppr_nodes) — top-k selection per seed per node type class PPRForwardPushState { -public: - PPRForwardPushState( - torch::Tensor seed_nodes, - int32_t seed_node_type_id, - double alpha, - double requeue_threshold_factor, - std::vector> node_type_to_edge_type_ids, - std::vector edge_type_to_dst_ntype_id, - std::vector degree_tensors - ) + public: + PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha, + double requeue_threshold_factor, + std::vector> node_type_to_edge_type_ids, + std::vector edge_type_to_dst_ntype_id, + std::vector degree_tensors) : alpha_(alpha), one_minus_alpha_(1.0 - alpha), requeue_threshold_factor_(requeue_threshold_factor), @@ -66,7 +62,6 @@ class PPRForwardPushState { node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)), edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)), degree_tensors_(std::move(degree_tensors)) { - TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D"); batch_size_ = static_cast(seed_nodes.size(0)); num_node_types_ = static_cast(node_type_to_edge_type_ids_.size()); @@ -74,15 +69,18 @@ class PPRForwardPushState { // Allocate per-seed, per-node-type tables. // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python. // Each inner element is an empty hash map / hash set for that (seed, ntype) pair. 
- ppr_scores_.assign(batch_size_, std::vector>(num_node_types_)); - residuals_.assign(batch_size_, std::vector>(num_node_types_)); - queue_.assign(batch_size_, std::vector>(num_node_types_)); - queued_nodes_.assign(batch_size_, std::vector>(num_node_types_)); + ppr_scores_.assign(batch_size_, + std::vector>(num_node_types_)); + residuals_.assign(batch_size_, + std::vector>(num_node_types_)); + queue_.assign(batch_size_, std::vector>(num_node_types_)); + queued_nodes_.assign(batch_size_, + std::vector>(num_node_types_)); // accessor() returns a typed view into the tensor's data that // supports [i] indexing with bounds checking in debug builds. Here we read // each seed node ID from the 1-D int64 tensor. - auto acc = seed_nodes.accessor(); + auto acc = seed_nodes.accessor(); num_nodes_in_queue_ = batch_size_; for (int32_t i = 0; i < batch_size_; ++i) { // static_cast: explicit narrowing from int64 to int32. @@ -116,7 +114,8 @@ class PPRForwardPushState { // (alias) to the existing set — clearing it modifies the original in-place // rather than operating on a copy. for (int32_t s = 0; s < batch_size_; ++s) - for (auto& qs : queued_nodes_[s]) qs.clear(); + for (auto& qs : queued_nodes_[s]) + qs.clear(); // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for // edge type eid this round. Using a set deduplicates nodes that appear @@ -126,7 +125,8 @@ class PPRForwardPushState { for (int32_t s = 0; s < batch_size_; ++s) { for (int32_t nt = 0; nt < num_node_types_; ++nt) { - if (queue_[s][nt].empty()) continue; + if (queue_[s][nt].empty()) + continue; // Move the live queue into the snapshot (no data copy — O(1)). // queue_ is then reset to an empty set so new entries added by @@ -213,7 +213,8 @@ class PPRForwardPushState { // c. Enqueue any neighbor whose residual now exceeds the requeue threshold. 
for (int32_t s = 0; s < batch_size_; ++s) { for (int32_t nt = 0; nt < num_node_types_; ++nt) { - if (queued_nodes_[s][nt].empty()) continue; + if (queued_nodes_[s][nt].empty()) + continue; for (int32_t src : queued_nodes_[s][nt]) { // `auto&` gives a reference to the residual map for this @@ -222,7 +223,7 @@ class PPRForwardPushState { auto& src_res = residuals_[s][nt]; // .find() returns an iterator; .end() means "not found". // We treat a missing entry as residual = 0. - auto it = src_res.find(src); + auto it = src_res.find(src); double res = (it != src_res.end()) ? it->second : 0.0; // a. Absorb: move residual into the PPR score. @@ -232,7 +233,8 @@ class PPRForwardPushState { int32_t total_deg = get_total_degree(src, nt); // Destination-only nodes (no outgoing edges) absorb residual // into their PPR score but do not push further. - if (total_deg == 0) continue; + if (total_deg == 0) + continue; // b. Distribute: each neighbor of src (across all edge types // from nt) receives an equal share of the pushed residual. @@ -249,18 +251,20 @@ class PPRForwardPushState { // We use a pointer (rather than copying the list) so we can check // for absence with nullptr without allocating anything. const std::vector* nbr_list = nullptr; - auto fi = fetched.find(pack_key(src, eid)); + auto fi = fetched.find(pack_key(src, eid)); if (fi != fetched.end()) { // `&fi->second` takes the address of the vector stored in // the map — nbr_list now points to it without copying. nbr_list = &fi->second; } else { auto ci = neighbor_cache_.find(pack_key(src, eid)); - if (ci != neighbor_cache_.end()) nbr_list = &ci->second; + if (ci != neighbor_cache_.end()) + nbr_list = &ci->second; } // Skip if no neighbor list is available (node has no edges of // this type, or the fetch returned an empty list). 
- if (!nbr_list || nbr_list->empty()) continue; + if (!nbr_list || nbr_list->empty()) + continue; int32_t dst_nt = edge_type_to_dst_ntype_id_[eid]; @@ -270,7 +274,7 @@ class PPRForwardPushState { residuals_[s][dst_nt][nbr] += res_per_nbr; double threshold = requeue_threshold_factor_ * - static_cast(get_total_degree(nbr, dst_nt)); + static_cast(get_total_degree(nbr, dst_nt)); // Only enqueue if: (1) not already in queue for this // iteration, and (2) residual exceeds the push threshold @@ -315,13 +319,14 @@ class PPRForwardPushState { std::unordered_set active; for (int32_t s = 0; s < batch_size_; ++s) for (int32_t nt = 0; nt < num_node_types_; ++nt) - if (!ppr_scores_[s][nt].empty()) active.insert(nt); + if (!ppr_scores_[s][nt].empty()) + active.insert(nt); py::dict result; for (int32_t nt : active) { // Flat output vectors — entries for all seeds are concatenated. std::vector flat_ids; - std::vector flat_weights; + std::vector flat_weights; std::vector valid_counts; for (int32_t s = 0; s < batch_size_; ++s) { @@ -341,7 +346,8 @@ class PPRForwardPushState { // is an anonymous comparator (like Python's `key=` argument). // `.second` accesses the score (second element of the pair); // `>` makes it descending (highest score first). - std::partial_sort(items.begin(), items.begin() + k, items.end(), + std::partial_sort( + items.begin(), items.begin() + k, items.end(), [](const auto& a, const auto& b) { return a.second > b.second; }); for (int32_t i = 0; i < k; ++i) { @@ -355,27 +361,25 @@ class PPRForwardPushState { } // py::make_tuple wraps C++ values into a Python tuple. 
- result[py::int_(nt)] = py::make_tuple( - torch::tensor(flat_ids, torch::kLong), - torch::tensor(flat_weights, torch::kFloat), - torch::tensor(valid_counts, torch::kLong) - ); + result[py::int_(nt)] = py::make_tuple(torch::tensor(flat_ids, torch::kLong), + torch::tensor(flat_weights, torch::kFloat), + torch::tensor(valid_counts, torch::kLong)); } return result; } -private: + private: // Look up the total (across all edge types) out-degree of a node. // Returns 0 for destination-only node types (no outgoing edges). int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const { - if (ntype_id >= static_cast(degree_tensors_.size())) return 0; + if (ntype_id >= static_cast(degree_tensors_.size())) + return 0; const auto& t = degree_tensors_[ntype_id]; - if (t.numel() == 0) return 0; // destination-only type: no outgoing edges - TORCH_CHECK( - node_id < static_cast(t.size(0)), - "Node ID ", node_id, " out of range for degree tensor of ntype_id ", ntype_id, - " (size=", t.size(0), "). This indicates corrupted graph data or a sampler bug." - ); + if (t.numel() == 0) + return 0; // destination-only type: no outgoing edges + TORCH_CHECK(node_id < static_cast(t.size(0)), "Node ID ", node_id, + " out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0), + "). This indicates corrupted graph data or a sampler bug."); // data_ptr() returns a raw C pointer to the tensor's int32 data // buffer. Direct pointer indexing ([node_id]) is safe here because we // validated the bounds with TORCH_CHECK above. 
@@ -385,13 +389,14 @@ class PPRForwardPushState { // ------------------------------------------------------------------------- // Scalar algorithm parameters // ------------------------------------------------------------------------- - double alpha_; // Restart probability - double one_minus_alpha_; // 1 - alpha, precomputed to avoid repeated subtraction - double requeue_threshold_factor_; // alpha * eps; multiplied by degree to get per-node threshold + double alpha_; // Restart probability + double one_minus_alpha_; // 1 - alpha, precomputed to avoid repeated subtraction + double + requeue_threshold_factor_; // alpha * eps; multiplied by degree to get per-node threshold - int32_t batch_size_; // Number of seeds in the current batch - int32_t num_node_types_; // Total number of node types (homo + hetero) - int32_t num_nodes_in_queue_{0}; // Running count of nodes across all seeds / types + int32_t batch_size_; // Number of seeds in the current batch + int32_t num_node_types_; // Total number of node types (homo + hetero) + int32_t num_nodes_in_queue_{0}; // Running count of nodes across all seeds / types // ------------------------------------------------------------------------- // Graph structure (read-only after construction) @@ -400,10 +405,10 @@ class PPRForwardPushState { // traversed from that node type (outgoing or incoming, depending on edge_dir). std::vector> node_type_to_edge_type_ids_; // edge_type_to_dst_ntype_id_[eid] → node type ID at the destination end. - std::vector edge_type_to_dst_ntype_id_; + std::vector edge_type_to_dst_ntype_id_; // degree_tensors_[ntype_id][node_id] → total degree of that node across all // edge types traversable from its type. Empty tensor means no outgoing edges. 
- std::vector degree_tensors_; + std::vector degree_tensors_; // ------------------------------------------------------------------------- // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]) @@ -417,11 +422,11 @@ class PPRForwardPushState { // residuals_[s][nt]: node_id → unabsorbed probability mass waiting to be pushed std::vector>> residuals_; // queue_[s][nt]: nodes whose residual exceeds the threshold and need a push next round - std::vector>> queue_; + std::vector>> queue_; // queued_nodes_[s][nt]: snapshot of queue_ taken by drain_queue() for the current round. // Separating it from queue_ lets push_residuals() enqueue new nodes into queue_ without // modifying the set currently being iterated. - std::vector>> queued_nodes_; + std::vector>> queued_nodes_; // ------------------------------------------------------------------------- // Neighbor cache @@ -445,15 +450,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // .def(py::init<...>()) exposes the constructor. The template arguments // list the exact C++ parameter types so pybind11 can convert Python // arguments to the correct C++ types automatically. 
- .def(py::init< - torch::Tensor, - int32_t, - double, double, - std::vector>, - std::vector, - std::vector - >()) - .def("drain_queue", &PPRForwardPushState::drain_queue) + .def(py::init>, + std::vector, std::vector>()) + .def("drain_queue", &PPRForwardPushState::drain_queue) .def("push_residuals", &PPRForwardPushState::push_residuals) - .def("extract_top_k", &PPRForwardPushState::extract_top_k); + .def("extract_top_k", &PPRForwardPushState::extract_top_k); } From dd118ef09320b0afea72f79b93dbc3d691a6e4be Mon Sep 17 00:00:00 2001 From: mkolodner Date: Wed, 25 Mar 2026 20:49:28 +0000 Subject: [PATCH 06/14] Move PPR C++ to gigl/csrc following PyTorch csrc conventions --- gigl/csrc/distributed/__init__.py | 9 + gigl/csrc/distributed/ppr_forward_push.cpp | 247 ++++++++++ gigl/csrc/distributed/ppr_forward_push.h | 121 +++++ .../distributed}/ppr_forward_push.pyi | 0 .../distributed/python_ppr_forward_push.cpp | 63 +++ gigl/distributed/cpp_extensions/__init__.py | 9 - .../cpp_extensions/ppr_forward_push.cpp | 458 ------------------ gigl/distributed/dist_ppr_sampler.py | 2 +- 8 files changed, 441 insertions(+), 468 deletions(-) create mode 100644 gigl/csrc/distributed/__init__.py create mode 100644 gigl/csrc/distributed/ppr_forward_push.cpp create mode 100644 gigl/csrc/distributed/ppr_forward_push.h rename gigl/{distributed/cpp_extensions => csrc/distributed}/ppr_forward_push.pyi (100%) create mode 100644 gigl/csrc/distributed/python_ppr_forward_push.cpp delete mode 100644 gigl/distributed/cpp_extensions/__init__.py delete mode 100644 gigl/distributed/cpp_extensions/ppr_forward_push.cpp diff --git a/gigl/csrc/distributed/__init__.py b/gigl/csrc/distributed/__init__.py new file mode 100644 index 000000000..d8ffa921a --- /dev/null +++ b/gigl/csrc/distributed/__init__.py @@ -0,0 +1,9 @@ +try: + from gigl.csrc.distributed.ppr_forward_push import PPRForwardPushState +except ImportError as e: + raise ImportError( + "PPR C++ extension not compiled. 
" + "Run `make build_cpp_extensions` from the GiGL root to build it." + ) from e + +__all__ = ["PPRForwardPushState"] diff --git a/gigl/csrc/distributed/ppr_forward_push.cpp b/gigl/csrc/distributed/ppr_forward_push.cpp new file mode 100644 index 000000000..a514907ab --- /dev/null +++ b/gigl/csrc/distributed/ppr_forward_push.cpp @@ -0,0 +1,247 @@ +#include "ppr_forward_push.h" + +PPRForwardPushState::PPRForwardPushState( + torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha, + double requeue_threshold_factor, + std::vector> node_type_to_edge_type_ids, + std::vector edge_type_to_dst_ntype_id, std::vector degree_tensors) + : alpha_(alpha), + one_minus_alpha_(1.0 - alpha), + requeue_threshold_factor_(requeue_threshold_factor), + // std::move transfers ownership of each vector into the member variable + // without copying its contents — equivalent to Python's list hand-off + // when you no longer need the original. + node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)), + edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)), + degree_tensors_(std::move(degree_tensors)) { + TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D"); + batch_size_ = static_cast(seed_nodes.size(0)); + num_node_types_ = static_cast(node_type_to_edge_type_ids_.size()); + + // Allocate per-seed, per-node-type tables. + // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python. + ppr_scores_.assign(batch_size_, + std::vector>(num_node_types_)); + residuals_.assign(batch_size_, + std::vector>(num_node_types_)); + queue_.assign(batch_size_, std::vector>(num_node_types_)); + queued_nodes_.assign(batch_size_, + std::vector>(num_node_types_)); + + // accessor() returns a typed view into the tensor's data that + // supports [i] indexing with bounds checking in debug builds. 
+ auto acc = seed_nodes.accessor(); + num_nodes_in_queue_ = batch_size_; + for (int32_t i = 0; i < batch_size_; ++i) { + int32_t seed = static_cast(acc[i]); + // PPR initialisation: each seed starts with residual = alpha (the + // restart probability). The first push will move alpha into ppr_score + // and distribute (1-alpha)*alpha to the seed's neighbors. + residuals_[i][seed_node_type_id][seed] = alpha_; + queue_[i][seed_node_type_id].insert(seed); + } +} + +std::optional> PPRForwardPushState::drain_queue() { + if (num_nodes_in_queue_ == 0) { + return std::nullopt; + } + + // Reset the snapshot from the previous iteration. + for (int32_t s = 0; s < batch_size_; ++s) + for (auto& qs : queued_nodes_[s]) + qs.clear(); + + // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for + // edge type eid this round. Using a set deduplicates nodes that appear + // in multiple seeds' queues: we only fetch each (node, etype) pair once. + std::unordered_map> nodes_to_lookup; + + for (int32_t s = 0; s < batch_size_; ++s) { + for (int32_t nt = 0; nt < num_node_types_; ++nt) { + if (queue_[s][nt].empty()) + continue; + + // Move the live queue into the snapshot (no data copy — O(1)). 
+ queued_nodes_[s][nt] = std::move(queue_[s][nt]); + queue_[s][nt].clear(); + num_nodes_in_queue_ -= static_cast(queued_nodes_[s][nt].size()); + + for (int32_t node_id : queued_nodes_[s][nt]) { + for (int32_t eid : node_type_to_edge_type_ids_[nt]) { + if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) { + nodes_to_lookup[eid].insert(node_id); + } + } + } + } + } + + std::unordered_map result; + for (auto& [eid, node_set] : nodes_to_lookup) { + std::vector ids(node_set.begin(), node_set.end()); + result[eid] = torch::tensor(ids, torch::kLong); + } + return result; +} + +void PPRForwardPushState::push_residuals( + const std::unordered_map>& + fetched_by_etype_id) { + // Step 1: Unpack the input map into a C++ map keyed by pack_key(node_id, etype_id) + // for fast lookup during the residual-push loop below. + std::unordered_map> fetched; + for (const auto& [eid, tup] : fetched_by_etype_id) { + const auto& node_ids_t = std::get<0>(tup); + const auto& flat_nbrs_t = std::get<1>(tup); + const auto& counts_t = std::get<2>(tup); + + // accessor() gives a typed 1-D view into each tensor (dtype/rank checked at creation). + // Element reads are NOT bounds-checked, so counts must be consistent with flat_nbrs. + auto node_acc = node_ids_t.accessor(); + auto nbr_acc = flat_nbrs_t.accessor(); + auto cnt_acc = counts_t.accessor(); + + // Walk the flat neighbor list, slicing out each node's neighbors using + // the running offset into the concatenated flat buffer. + int64_t offset = 0; + for (int64_t i = 0; i < node_ids_t.size(0); ++i) { + int32_t nid = static_cast(node_acc[i]); + int64_t count = cnt_acc[i]; + std::vector nbrs(count); + for (int64_t j = 0; j < count; ++j) + nbrs[j] = static_cast(nbr_acc[offset + j]); + fetched[pack_key(nid, eid)] = std::move(nbrs); + offset += count; + } + } + + // Step 2: For every node that was in the queue (captured in queued_nodes_ + // by drain_queue()), apply one PPR push step: + // a. Absorb residual into the PPR score. + // b. 
Distribute (1-alpha) * residual equally to each neighbor. + // c. Enqueue any neighbor whose residual now exceeds the requeue threshold. + for (int32_t s = 0; s < batch_size_; ++s) { + for (int32_t nt = 0; nt < num_node_types_; ++nt) { + if (queued_nodes_[s][nt].empty()) + continue; + + for (int32_t src : queued_nodes_[s][nt]) { + auto& src_res = residuals_[s][nt]; + auto it = src_res.find(src); + double res = (it != src_res.end()) ? it->second : 0.0; + + // a. Absorb: move residual into the PPR score. + ppr_scores_[s][nt][src] += res; + src_res[src] = 0.0; + + int32_t total_deg = get_total_degree(src, nt); + // Destination-only nodes absorb residual but do not push further. + if (total_deg == 0) + continue; + + // b. Distribute: each neighbor receives an equal share. + double res_per_nbr = one_minus_alpha_ * res / static_cast(total_deg); + + for (int32_t eid : node_type_to_edge_type_ids_[nt]) { + // Invariant: fetched and neighbor_cache_ are mutually exclusive for + // any given (node, etype) key within one iteration. drain_queue() + // only requests a fetch for nodes absent from neighbor_cache_, so a + // key is in at most one of the two. + const std::vector* nbr_list = nullptr; + auto fi = fetched.find(pack_key(src, eid)); + if (fi != fetched.end()) { + nbr_list = &fi->second; + } else { + auto ci = neighbor_cache_.find(pack_key(src, eid)); + if (ci != neighbor_cache_.end()) + nbr_list = &ci->second; + } + if (!nbr_list || nbr_list->empty()) + continue; + + int32_t dst_nt = edge_type_to_dst_ntype_id_[eid]; + + // c. Accumulate residual for each neighbor and re-enqueue if threshold + // exceeded. 
+ for (int32_t nbr : *nbr_list) { + residuals_[s][dst_nt][nbr] += res_per_nbr; + + double threshold = requeue_threshold_factor_ * + static_cast(get_total_degree(nbr, dst_nt)); + + if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() && + residuals_[s][dst_nt][nbr] >= threshold) { + queue_[s][dst_nt].insert(nbr); + ++num_nodes_in_queue_; + + // Promote neighbor lists to the persistent cache: this node will + // be processed next iteration, so caching avoids a re-fetch. + for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) { + uint64_t pk = pack_key(nbr, peid); + if (neighbor_cache_.find(pk) == neighbor_cache_.end()) { + auto pfi = fetched.find(pk); + if (pfi != fetched.end()) + neighbor_cache_[pk] = pfi->second; + } + } + } + } + } + } + } + } +} + +std::unordered_map> +PPRForwardPushState::extract_top_k(int32_t max_ppr_nodes) { + std::unordered_set active; + for (int32_t s = 0; s < batch_size_; ++s) + for (int32_t nt = 0; nt < num_node_types_; ++nt) + if (!ppr_scores_[s][nt].empty()) + active.insert(nt); + + std::unordered_map> result; + for (int32_t nt : active) { + std::vector flat_ids; + std::vector flat_weights; + std::vector valid_counts; + + for (int32_t s = 0; s < batch_size_; ++s) { + const auto& scores = ppr_scores_[s][nt]; + int32_t k = std::min(max_ppr_nodes, static_cast(scores.size())); + if (k > 0) { + std::vector> items(scores.begin(), scores.end()); + std::partial_sort( + items.begin(), items.begin() + k, items.end(), + [](const auto& a, const auto& b) { return a.second > b.second; }); + + for (int32_t i = 0; i < k; ++i) { + flat_ids.push_back(static_cast(items[i].first)); + // Cast to float32 for output; internal scores stay double to + // avoid accumulated rounding errors in the push loop. 
+ flat_weights.push_back(static_cast(items[i].second)); + } + } + valid_counts.push_back(static_cast(k)); + } + + result[nt] = {torch::tensor(flat_ids, torch::kLong), + torch::tensor(flat_weights, torch::kFloat), + torch::tensor(valid_counts, torch::kLong)}; + } + return result; +} + +int32_t PPRForwardPushState::get_total_degree(int32_t node_id, int32_t ntype_id) const { + if (ntype_id >= static_cast(degree_tensors_.size())) + return 0; + const auto& t = degree_tensors_[ntype_id]; + if (t.numel() == 0) + return 0; + TORCH_CHECK(node_id < static_cast(t.size(0)), "Node ID ", node_id, + " out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0), + "). This indicates corrupted graph data or a sampler bug."); + // data_ptr() returns a raw C pointer to the tensor's int32 data buffer. + return t.data_ptr()[node_id]; +} diff --git a/gigl/csrc/distributed/ppr_forward_push.h b/gigl/csrc/distributed/ppr_forward_push.h new file mode 100644 index 000000000..7f0c92f49 --- /dev/null +++ b/gigl/csrc/distributed/ppr_forward_push.h @@ -0,0 +1,121 @@ +#pragma once + +#include + +#include // std::partial_sort, std::min +#include // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t +#include // std::optional for nullable return values +#include // std::tuple for multi-value returns +#include // std::unordered_map — like Python dict, O(1) average lookup +#include // std::unordered_set — like Python set, O(1) average lookup +#include // std::vector — like Python list, contiguous in memory + +// Combine (node_id, etype_id) into a single 64-bit integer for use as a hash +// map key. A single 64-bit integer is cheaper to hash than a pair of two +// integers (std::unordered_map has no built-in pair hash). +// +// Bit layout: +// bits 63–32: node_id (upper half) +// bits 31– 0: etype_id (lower half) +// +// Both inputs are cast through uint32_t before packing. Without this, a +// negative int32_t (e.g. 
-1 = 0xFFFFFFFF) would be sign-extended to a full +// 64-bit value, corrupting the upper bits when shifted. Reinterpreting as +// uint32_t first treats the bit pattern as-is (no sign extension). +static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { + return (static_cast(static_cast(node_id)) << 32) | + static_cast(etype_id); +} + +// C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006). +// +// All hot-loop state (scores, residuals, queue, neighbor cache) lives inside +// this object. The distributed neighbor fetch is kept in Python because it +// involves async RPC calls that C++ cannot drive directly. +// +// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache. +// Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors). +// +// Typical call sequence per batch: +// 1. PPRForwardPushState(seed_nodes, ...) — init per-seed residuals / queue +// while True: +// 2. drain_queue() — drain queue → nodes needing lookup +// 3. _batch_fetch_neighbors(...) — distributed RPC fetch (stays in Python) +// 4. push_residuals(fetched_by_etype_id) — push residuals, update queue +// 5. extract_top_k(max_ppr_nodes) — top-k selection per seed per node type +class PPRForwardPushState { + public: + PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha, + double requeue_threshold_factor, + std::vector> node_type_to_edge_type_ids, + std::vector edge_type_to_dst_ntype_id, + std::vector degree_tensors); + + // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch + // neighbor lookup. Also snapshots the drained nodes into queued_nodes_ for + // use by push_residuals(). + // + // Return value semantics: + // - std::nullopt → queue was already empty; convergence achieved; stop the loop. + // - empty map → nodes were drained but all were cached; call push_residuals({}). + // - non-empty map → {etype_id → 1-D int64 tensor of node IDs} needing neighbor lookup. 
+ std::optional> drain_queue(); + + // Push residuals to neighbors given the fetched neighbor data. + // + // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)} + // - node_ids_tensor: [N] int64 — source node IDs fetched for this edge type + // - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat + // - counts_tensor: [N] int64 — neighbor count for each source node + void push_residuals(const std::unordered_map< + int32_t, std::tuple>& + fetched_by_etype_id); + + // Extract top-k PPR nodes per seed per node type. + // + // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}. + // Only node types that received any PPR score are included in the output. + // + // Output layout for a batch of B seeds: + // flat_ids[0 : valid_counts[0]] → top-k nodes for seed 0 + // flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1 + // ... + std::unordered_map> + extract_top_k(int32_t max_ppr_nodes); + + private: + // Look up the total (across all edge types) out-degree of a node. + // Returns 0 for destination-only node types (no outgoing edges). 
+ int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const; + + // ------------------------------------------------------------------------- + // Scalar algorithm parameters + // ------------------------------------------------------------------------- + double alpha_; // Restart probability + double one_minus_alpha_; // 1 - alpha, precomputed to avoid repeated subtraction + double requeue_threshold_factor_; // alpha * eps; multiplied by degree to get per-node threshold + + int32_t batch_size_; // Number of seeds in the current batch + int32_t num_node_types_; // Total number of node types (homo + hetero) + int32_t num_nodes_in_queue_{0}; // Running count of nodes across all seeds / types + + // ------------------------------------------------------------------------- + // Graph structure (read-only after construction) + // ------------------------------------------------------------------------- + std::vector> node_type_to_edge_type_ids_; + std::vector edge_type_to_dst_ntype_id_; + std::vector degree_tensors_; + + // ------------------------------------------------------------------------- + // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]) + // ------------------------------------------------------------------------- + std::vector>> ppr_scores_; + std::vector>> residuals_; + std::vector>> queue_; + std::vector>> queued_nodes_; + + // ------------------------------------------------------------------------- + // Neighbor cache + // ------------------------------------------------------------------------- + std::unordered_map> neighbor_cache_; +}; diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.pyi b/gigl/csrc/distributed/ppr_forward_push.pyi similarity index 100% rename from gigl/distributed/cpp_extensions/ppr_forward_push.pyi rename to gigl/csrc/distributed/ppr_forward_push.pyi diff --git a/gigl/csrc/distributed/python_ppr_forward_push.cpp b/gigl/csrc/distributed/python_ppr_forward_push.cpp new file mode 100644 index 
000000000..ebf3fa27a --- /dev/null +++ b/gigl/csrc/distributed/python_ppr_forward_push.cpp @@ -0,0 +1,63 @@ +// Python bindings for PPRForwardPushState. +// +// Follows PyTorch's csrc convention: pure C++ algorithm lives in +// ppr_forward_push.{h,cpp}; this file only handles type conversion between +// Python (pybind11) and C++ types, then delegates to the C++ implementation. + +#include +#include + +#include "ppr_forward_push.h" + +namespace py = pybind11; + +// drain_queue: C++ returns std::optional>. +// Exposed to Python as: None (convergence) or dict[int, Tensor]. +static py::object drain_queue_wrapper(PPRForwardPushState& self) { + auto result = self.drain_queue(); + if (!result) { + return py::none(); + } + py::dict d; + for (auto& [eid, tensor] : *result) { + d[py::int_(eid)] = tensor; + } + return d; +} + +// push_residuals: Python passes dict[int, tuple[Tensor, Tensor, Tensor]]. +// Convert to C++ map before delegating. +static void push_residuals_wrapper(PPRForwardPushState& self, py::dict fetched_by_etype_id) { + std::unordered_map> cpp_map; + for (auto item : fetched_by_etype_id) { + int32_t eid = item.first.cast(); + auto tup = item.second.cast(); + cpp_map[eid] = {tup[0].cast(), tup[1].cast(), + tup[2].cast()}; + } + self.push_residuals(cpp_map); +} + +// extract_top_k: C++ returns map>. +// Exposed to Python as dict[int, tuple[Tensor, Tensor, Tensor]]. +static py::dict extract_top_k_wrapper(PPRForwardPushState& self, int32_t max_ppr_nodes) { + auto result = self.extract_top_k(max_ppr_nodes); + py::dict d; + for (auto& [nt, tup] : result) { + d[py::int_(nt)] = + py::make_tuple(std::get<0>(tup), std::get<1>(tup), std::get<2>(tup)); + } + return d; +} + +// TORCH_EXTENSION_NAME is set by PyTorch's build system to match the Python +// module name derived from this file's path (e.g. "ppr_forward_push"). 
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + py::class_(m, "PPRForwardPushState") + .def(py::init>, std::vector, + std::vector>()) + .def("drain_queue", drain_queue_wrapper) + .def("push_residuals", push_residuals_wrapper) + .def("extract_top_k", extract_top_k_wrapper); +} diff --git a/gigl/distributed/cpp_extensions/__init__.py b/gigl/distributed/cpp_extensions/__init__.py deleted file mode 100644 index d375f59b1..000000000 --- a/gigl/distributed/cpp_extensions/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -try: - from gigl.distributed.cpp_extensions.ppr_forward_push import PPRForwardPushState -except ImportError as e: - raise ImportError( - "PPR C++ extension not compiled. " - "Run `uv pip install -e .` from the GiGL root to build it." - ) from e - -__all__ = ["PPRForwardPushState"] diff --git a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp b/gigl/distributed/cpp_extensions/ppr_forward_push.cpp deleted file mode 100644 index 0af3eb2b5..000000000 --- a/gigl/distributed/cpp_extensions/ppr_forward_push.cpp +++ /dev/null @@ -1,458 +0,0 @@ -#include // Automatic conversion between C++ containers and Python types -#include // PyTorch C++ API (tensors, TORCH_CHECK) - -#include // std::partial_sort, std::min -#include // Fixed-width integer types: int32_t, int64_t, uint32_t, uint64_t -#include // std::unordered_map — like Python dict, O(1) average lookup -#include // std::unordered_set — like Python set, O(1) average lookup -#include // std::vector — like Python list, contiguous in memory - -namespace py = pybind11; // Alias for the pybind11 namespace (bridges C++ ↔ Python) - -// Combine (node_id, etype_id) into a single 64-bit integer for use as a hash -// map key. A single 64-bit integer is cheaper to hash than a pair of two -// integers (std::unordered_map has no built-in pair hash). -// -// Bit layout: -// bits 63–32: node_id (upper half) -// bits 31– 0: etype_id (lower half) -// -// Both inputs are cast through uint32_t before packing. 
Without this, a -// negative int32_t (e.g. -1 = 0xFFFFFFFF) would be sign-extended to a full -// 64-bit value, corrupting the upper bits when shifted. Reinterpreting as -// uint32_t first treats the bit pattern as-is (no sign extension). -// -// `static inline` means: define this function here in the translation unit -// (not in a separate object file) and ask the compiler to inline it at each -// call site instead of generating a function call. -static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { - return (static_cast(static_cast(node_id)) << 32) | - static_cast(etype_id); -} - -// C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006). -// -// All hot-loop state (scores, residuals, queue, neighbor cache) lives inside -// this object. The distributed neighbor fetch is kept in Python because it -// involves async RPC calls that C++ cannot drive directly. -// -// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache. -// Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors). -// -// Typical call sequence per batch: -// 1. PPRForwardPushState(seed_nodes, ...) — init per-seed residuals / queue -// while True: -// 2. drain_queue() — drain queue → nodes needing lookup -// 3. — distributed RPC fetch (stays in Python) -// 4. push_residuals(fetched_by_etype_id) — push residuals, update queue -// 5. 
extract_top_k(max_ppr_nodes) — top-k selection per seed per node type -class PPRForwardPushState { - public: - PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha, - double requeue_threshold_factor, - std::vector> node_type_to_edge_type_ids, - std::vector edge_type_to_dst_ntype_id, - std::vector degree_tensors) - : alpha_(alpha), - one_minus_alpha_(1.0 - alpha), - requeue_threshold_factor_(requeue_threshold_factor), - // std::move transfers ownership of each vector into the member variable - // without copying its contents — equivalent to Python's list hand-off - // when you no longer need the original. - node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)), - edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)), - degree_tensors_(std::move(degree_tensors)) { - TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D"); - batch_size_ = static_cast(seed_nodes.size(0)); - num_node_types_ = static_cast(node_type_to_edge_type_ids_.size()); - - // Allocate per-seed, per-node-type tables. - // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python. - // Each inner element is an empty hash map / hash set for that (seed, ntype) pair. - ppr_scores_.assign(batch_size_, - std::vector>(num_node_types_)); - residuals_.assign(batch_size_, - std::vector>(num_node_types_)); - queue_.assign(batch_size_, std::vector>(num_node_types_)); - queued_nodes_.assign(batch_size_, - std::vector>(num_node_types_)); - - // accessor() returns a typed view into the tensor's data that - // supports [i] indexing with bounds checking in debug builds. Here we read - // each seed node ID from the 1-D int64 tensor. - auto acc = seed_nodes.accessor(); - num_nodes_in_queue_ = batch_size_; - for (int32_t i = 0; i < batch_size_; ++i) { - // static_cast: explicit narrowing from int64 to int32. - // The Python caller guarantees node IDs fit in 32 bits. 
- int32_t seed = static_cast(acc[i]); - // PPR initialisation: each seed starts with residual = alpha (the - // restart probability). The first push will move alpha into ppr_score - // and distribute (1-alpha)*alpha to the seed's neighbors. - residuals_[i][seed_node_type_id][seed] = alpha_; - queue_[i][seed_node_type_id].insert(seed); - } - } - - // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch - // neighbor lookup. Also snapshots the drained nodes into queued_nodes_ for - // use by push_residuals(). - // - // Return value semantics (py::object can hold any Python value): - // - py::none() → queue was already empty; convergence achieved; stop the loop. - // - py::dict{} → nodes were drained. The dict maps etype_id → 1-D int64 - // tensor of node IDs that need neighbor lookups this round. - // May be empty if all drained nodes were already in the cache - // or had no outgoing edges — push_residuals must still be called - // to flush their accumulated residual into ppr_scores_. - py::object drain_queue() { - if (num_nodes_in_queue_ == 0) { - return py::none(); - } - - // Reset the snapshot from the previous iteration. `auto&` is a reference - // (alias) to the existing set — clearing it modifies the original in-place - // rather than operating on a copy. - for (int32_t s = 0; s < batch_size_; ++s) - for (auto& qs : queued_nodes_[s]) - qs.clear(); - - // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for - // edge type eid this round. Using a set deduplicates nodes that appear - // in multiple seeds' queues: we only fetch each (node, etype) pair once - // regardless of how many seeds need it. - std::unordered_map> nodes_to_lookup; - - for (int32_t s = 0; s < batch_size_; ++s) { - for (int32_t nt = 0; nt < num_node_types_; ++nt) { - if (queue_[s][nt].empty()) - continue; - - // Move the live queue into the snapshot (no data copy — O(1)). 
- // queue_ is then reset to an empty set so new entries added by - // push_residuals() in this same iteration don't interfere. - queued_nodes_[s][nt] = std::move(queue_[s][nt]); - queue_[s][nt].clear(); - num_nodes_in_queue_ -= static_cast(queued_nodes_[s][nt].size()); - - for (int32_t node_id : queued_nodes_[s][nt]) { - for (int32_t eid : node_type_to_edge_type_ids_[nt]) { - // Only request a fetch if the neighbor list isn't already - // cached from a previous iteration. - if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) { - nodes_to_lookup[eid].insert(node_id); - } - } - } - } - } - - // Convert to Python: {etype_id (int) → 1-D int64 tensor of node IDs}. - // py::int_(eid) wraps a C++ int as a Python int so it can be used as a - // dict key on the Python side. - py::dict result; - for (auto& [eid, node_set] : nodes_to_lookup) { - // Copy the set into a vector first: torch::tensor() requires a - // contiguous sequence, not an unordered_set iterator. - std::vector ids(node_set.begin(), node_set.end()); - result[py::int_(eid)] = torch::tensor(ids, torch::kLong); - } - return result; - } - - // Push residuals to neighbors given the fetched neighbor data. - // - // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)} - // - node_ids_tensor: [N] int64 — source node IDs fetched for this edge type - // - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat - // - counts_tensor: [N] int64 — neighbor count for each source node - // - // For example, if nodes 3 and 7 were fetched for etype 0: - // node_ids = [3, 7] - // flat_nbrs = [10, 11, 12, 20] ← node 3 has nbrs {10,11,12}, node 7 has nbr {20} - // counts = [3, 1] - void push_residuals(py::dict fetched_by_etype_id) { - // Step 1: Unpack the Python dict into a C++ map for fast lookup during - // the residual-push loop below. 
- // fetched: pack_key(node_id, etype_id) → neighbor list (as int32_t vector) - std::unordered_map> fetched; - for (auto item : fetched_by_etype_id) { - int32_t eid = item.first.cast(); - // .cast() interprets the Python value as a tuple so we - // can index into it with [0], [1], [2]. - auto tup = item.second.cast(); - auto node_ids_t = tup[0].cast(); - auto flat_nbrs_t = tup[1].cast(); - auto counts_t = tup[2].cast(); - - // accessor() gives a bounds-checked, typed 1-D view into - // each tensor's data — equivalent to iterating over a NumPy array. - auto node_acc = node_ids_t.accessor(); - auto nbr_acc = flat_nbrs_t.accessor(); - auto cnt_acc = counts_t.accessor(); - - // Walk the flat neighbor list, slicing out each node's neighbors using - // the running offset into the concatenated flat buffer. - int64_t offset = 0; - for (int64_t i = 0; i < node_ids_t.size(0); ++i) { - int32_t nid = static_cast(node_acc[i]); - int64_t count = cnt_acc[i]; - std::vector nbrs(count); - for (int64_t j = 0; j < count; ++j) - nbrs[j] = static_cast(nbr_acc[offset + j]); - // std::move: hand off nbrs to the map without copying its contents. - fetched[pack_key(nid, eid)] = std::move(nbrs); - offset += count; - } - } - - // Step 2: For every node that was in the queue (captured in queued_nodes_ - // by drain_queue()), apply one PPR push step: - // a. Absorb residual into the PPR score. - // b. Distribute (1-alpha) * residual equally to each neighbor. - // c. Enqueue any neighbor whose residual now exceeds the requeue threshold. - for (int32_t s = 0; s < batch_size_; ++s) { - for (int32_t nt = 0; nt < num_node_types_; ++nt) { - if (queued_nodes_[s][nt].empty()) - continue; - - for (int32_t src : queued_nodes_[s][nt]) { - // `auto&` gives a reference to the residual map for this - // (seed, node_type) pair so we can read and write it without - // an extra hash lookup each time. - auto& src_res = residuals_[s][nt]; - // .find() returns an iterator; .end() means "not found". 
- // We treat a missing entry as residual = 0. - auto it = src_res.find(src); - double res = (it != src_res.end()) ? it->second : 0.0; - - // a. Absorb: move residual into the PPR score. - ppr_scores_[s][nt][src] += res; - src_res[src] = 0.0; - - int32_t total_deg = get_total_degree(src, nt); - // Destination-only nodes (no outgoing edges) absorb residual - // into their PPR score but do not push further. - if (total_deg == 0) - continue; - - // b. Distribute: each neighbor of src (across all edge types - // from nt) receives an equal share of the pushed residual. - double res_per_nbr = one_minus_alpha_ * res / static_cast(total_deg); - - for (int32_t eid : node_type_to_edge_type_ids_[nt]) { - // Invariant: fetched and neighbor_cache_ are mutually exclusive for - // any given (node, etype) key within one iteration. drain_queue() - // only requests a fetch for nodes absent from neighbor_cache_, so a - // key is in at most one of the two. We check fetched first since it - // is the common case for newly-seen nodes. - // - // `const std::vector*` is a pointer to a neighbor list. - // We use a pointer (rather than copying the list) so we can check - // for absence with nullptr without allocating anything. - const std::vector* nbr_list = nullptr; - auto fi = fetched.find(pack_key(src, eid)); - if (fi != fetched.end()) { - // `&fi->second` takes the address of the vector stored in - // the map — nbr_list now points to it without copying. - nbr_list = &fi->second; - } else { - auto ci = neighbor_cache_.find(pack_key(src, eid)); - if (ci != neighbor_cache_.end()) - nbr_list = &ci->second; - } - // Skip if no neighbor list is available (node has no edges of - // this type, or the fetch returned an empty list). - if (!nbr_list || nbr_list->empty()) - continue; - - int32_t dst_nt = edge_type_to_dst_ntype_id_[eid]; - - // c. For each neighbor, accumulate residual and check threshold. - // `*nbr_list` dereferences the pointer to access the vector. 
- for (int32_t nbr : *nbr_list) { - residuals_[s][dst_nt][nbr] += res_per_nbr; - - double threshold = requeue_threshold_factor_ * - static_cast(get_total_degree(nbr, dst_nt)); - - // Only enqueue if: (1) not already in queue for this - // iteration, and (2) residual exceeds the push threshold - // alpha * eps * degree. - if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() && - residuals_[s][dst_nt][nbr] >= threshold) { - queue_[s][dst_nt].insert(nbr); - ++num_nodes_in_queue_; // ++x is equivalent to x += 1 - - // Promote this node's neighbor lists to the persistent cache: - // it will be processed next iteration, so caching now avoids - // a re-fetch. Nodes that are never requeued (typically - // high-degree) are never promoted, keeping their large neighbor - // lists out of the cache. - for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) { - uint64_t pk = pack_key(nbr, peid); - if (neighbor_cache_.find(pk) == neighbor_cache_.end()) { - auto pfi = fetched.find(pk); - if (pfi != fetched.end()) - neighbor_cache_[pk] = pfi->second; - } - } - } - } - } - } - } - } - } - - // Extract top-k PPR nodes per seed per node type. - // - // Returns {ntype_id: (flat_ids_tensor, flat_weights_tensor, valid_counts_tensor)}. - // Only node types that received any PPR score are included in the output. - // - // Output layout for a batch of B seeds (same structure as _batch_fetch_neighbors): - // flat_ids[0 : valid_counts[0]] → top-k nodes for seed 0 - // flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1 - // ... - py::dict extract_top_k(int32_t max_ppr_nodes) { - // Collect node types that have any PPR score — skip types with no activity. - std::unordered_set active; - for (int32_t s = 0; s < batch_size_; ++s) - for (int32_t nt = 0; nt < num_node_types_; ++nt) - if (!ppr_scores_[s][nt].empty()) - active.insert(nt); - - py::dict result; - for (int32_t nt : active) { - // Flat output vectors — entries for all seeds are concatenated. 
- std::vector flat_ids; - std::vector flat_weights; - std::vector valid_counts; - - for (int32_t s = 0; s < batch_size_; ++s) { - // `const auto&` is a read-only reference — we iterate the map - // without copying it. - const auto& scores = ppr_scores_[s][nt]; - // Cap k at the number of nodes that actually have a score. - int32_t k = std::min(max_ppr_nodes, static_cast(scores.size())); - if (k > 0) { - // Copy the map entries into a vector of (node_id, score) pairs - // so they can be sorted. std::pair is like a Python 2-tuple. - std::vector> items(scores.begin(), scores.end()); - - // std::partial_sort rearranges items so that the first k entries - // are the k largest — like Python's heapq.nlargest but in-place. - // The lambda `[](const auto& a, const auto& b) { return ...; }` - // is an anonymous comparator (like Python's `key=` argument). - // `.second` accesses the score (second element of the pair); - // `>` makes it descending (highest score first). - std::partial_sort( - items.begin(), items.begin() + k, items.end(), - [](const auto& a, const auto& b) { return a.second > b.second; }); - - for (int32_t i = 0; i < k; ++i) { - flat_ids.push_back(static_cast(items[i].first)); - // Cast to float32 for output; internal scores stay double to - // avoid accumulated rounding errors in the push loop above. - flat_weights.push_back(static_cast(items[i].second)); - } - } - valid_counts.push_back(static_cast(k)); - } - - // py::make_tuple wraps C++ values into a Python tuple. - result[py::int_(nt)] = py::make_tuple(torch::tensor(flat_ids, torch::kLong), - torch::tensor(flat_weights, torch::kFloat), - torch::tensor(valid_counts, torch::kLong)); - } - return result; - } - - private: - // Look up the total (across all edge types) out-degree of a node. - // Returns 0 for destination-only node types (no outgoing edges). 
- int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const { - if (ntype_id >= static_cast(degree_tensors_.size())) - return 0; - const auto& t = degree_tensors_[ntype_id]; - if (t.numel() == 0) - return 0; // destination-only type: no outgoing edges - TORCH_CHECK(node_id < static_cast(t.size(0)), "Node ID ", node_id, - " out of range for degree tensor of ntype_id ", ntype_id, " (size=", t.size(0), - "). This indicates corrupted graph data or a sampler bug."); - // data_ptr() returns a raw C pointer to the tensor's int32 data - // buffer. Direct pointer indexing ([node_id]) is safe here because we - // validated the bounds with TORCH_CHECK above. - return t.data_ptr()[node_id]; - } - - // ------------------------------------------------------------------------- - // Scalar algorithm parameters - // ------------------------------------------------------------------------- - double alpha_; // Restart probability - double one_minus_alpha_; // 1 - alpha, precomputed to avoid repeated subtraction - double - requeue_threshold_factor_; // alpha * eps; multiplied by degree to get per-node threshold - - int32_t batch_size_; // Number of seeds in the current batch - int32_t num_node_types_; // Total number of node types (homo + hetero) - int32_t num_nodes_in_queue_{0}; // Running count of nodes across all seeds / types - - // ------------------------------------------------------------------------- - // Graph structure (read-only after construction) - // ------------------------------------------------------------------------- - // node_type_to_edge_type_ids_[ntype_id] → list of edge type IDs that can be - // traversed from that node type (outgoing or incoming, depending on edge_dir). - std::vector> node_type_to_edge_type_ids_; - // edge_type_to_dst_ntype_id_[eid] → node type ID at the destination end. - std::vector edge_type_to_dst_ntype_id_; - // degree_tensors_[ntype_id][node_id] → total degree of that node across all - // edge types traversable from its type. 
Empty tensor means no outgoing edges. - std::vector degree_tensors_; - - // ------------------------------------------------------------------------- - // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]) - // ------------------------------------------------------------------------- - // double precision avoids float32 rounding errors accumulating over 20-30 - // push iterations, which would otherwise cause ~1e-4 score errors vs the - // true PPR. Output weights are cast to float32 in extract_top_k. - // - // ppr_scores_[s][nt]: node_id → absorbed PPR score (Σ of residuals pushed so far) - std::vector>> ppr_scores_; - // residuals_[s][nt]: node_id → unabsorbed probability mass waiting to be pushed - std::vector>> residuals_; - // queue_[s][nt]: nodes whose residual exceeds the threshold and need a push next round - std::vector>> queue_; - // queued_nodes_[s][nt]: snapshot of queue_ taken by drain_queue() for the current round. - // Separating it from queue_ lets push_residuals() enqueue new nodes into queue_ without - // modifying the set currently being iterated. - std::vector>> queued_nodes_; - - // ------------------------------------------------------------------------- - // Neighbor cache - // ------------------------------------------------------------------------- - // Persistent cache: pack_key(node_id, etype_id) → neighbor list. - // Only nodes that have been re-queued (and will therefore be processed again) - // are promoted here from the per-iteration fetched map in push_residuals(). - // This avoids re-fetching neighbors for nodes processed in multiple iterations - // while keeping large neighbor lists of high-degree (never-requeued) nodes - // out of memory. - std::unordered_map> neighbor_cache_; -}; - -// Register PPRForwardPushState with Python via pybind11. -// -// TORCH_EXTENSION_NAME is set by PyTorch's setup() at build time to match the -// Python module name (e.g. "ppr_forward_push"). 
At import time, Python calls -// this function to populate the module with the C++ class. -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - py::class_(m, "PPRForwardPushState") - // .def(py::init<...>()) exposes the constructor. The template arguments - // list the exact C++ parameter types so pybind11 can convert Python - // arguments to the correct C++ types automatically. - .def(py::init>, - std::vector, std::vector>()) - .def("drain_queue", &PPRForwardPushState::drain_queue) - .def("push_residuals", &PPRForwardPushState::push_residuals) - .def("extract_top_k", &PPRForwardPushState::extract_top_k); -} diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py index b63329357..6285d67e6 100644 --- a/gigl/distributed/dist_ppr_sampler.py +++ b/gigl/distributed/dist_ppr_sampler.py @@ -14,7 +14,7 @@ from graphlearn_torch.typing import EdgeType, NodeType from graphlearn_torch.utils import merge_dict -from gigl.distributed.cpp_extensions import PPRForwardPushState +from gigl.csrc.distributed import PPRForwardPushState from gigl.distributed.dist_neighbor_sampler import DistNeighborSampler from gigl.types.graph import is_label_edge_type From c66a6e53a6e7cda2564394a2ffe61e59cc962125 Mon Sep 17 00:00:00 2001 From: mkolodner Date: Wed, 25 Mar 2026 22:21:38 +0000 Subject: [PATCH 07/14] Update --- gigl/csrc/{distributed => sampling}/__init__.py | 2 +- gigl/csrc/{distributed => sampling}/ppr_forward_push.cpp | 0 gigl/csrc/{distributed => sampling}/ppr_forward_push.h | 0 gigl/csrc/{distributed => sampling}/ppr_forward_push.pyi | 0 gigl/csrc/{distributed => sampling}/python_ppr_forward_push.cpp | 0 gigl/distributed/dist_ppr_sampler.py | 2 +- 6 files changed, 2 insertions(+), 2 deletions(-) rename gigl/csrc/{distributed => sampling}/__init__.py (74%) rename gigl/csrc/{distributed => sampling}/ppr_forward_push.cpp (100%) rename gigl/csrc/{distributed => sampling}/ppr_forward_push.h (100%) rename gigl/csrc/{distributed => sampling}/ppr_forward_push.pyi 
(100%) rename gigl/csrc/{distributed => sampling}/python_ppr_forward_push.cpp (100%) diff --git a/gigl/csrc/distributed/__init__.py b/gigl/csrc/sampling/__init__.py similarity index 74% rename from gigl/csrc/distributed/__init__.py rename to gigl/csrc/sampling/__init__.py index d8ffa921a..b2e23ba6c 100644 --- a/gigl/csrc/distributed/__init__.py +++ b/gigl/csrc/sampling/__init__.py @@ -1,5 +1,5 @@ try: - from gigl.csrc.distributed.ppr_forward_push import PPRForwardPushState + from gigl.csrc.sampling.ppr_forward_push import PPRForwardPushState except ImportError as e: raise ImportError( "PPR C++ extension not compiled. " diff --git a/gigl/csrc/distributed/ppr_forward_push.cpp b/gigl/csrc/sampling/ppr_forward_push.cpp similarity index 100% rename from gigl/csrc/distributed/ppr_forward_push.cpp rename to gigl/csrc/sampling/ppr_forward_push.cpp diff --git a/gigl/csrc/distributed/ppr_forward_push.h b/gigl/csrc/sampling/ppr_forward_push.h similarity index 100% rename from gigl/csrc/distributed/ppr_forward_push.h rename to gigl/csrc/sampling/ppr_forward_push.h diff --git a/gigl/csrc/distributed/ppr_forward_push.pyi b/gigl/csrc/sampling/ppr_forward_push.pyi similarity index 100% rename from gigl/csrc/distributed/ppr_forward_push.pyi rename to gigl/csrc/sampling/ppr_forward_push.pyi diff --git a/gigl/csrc/distributed/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp similarity index 100% rename from gigl/csrc/distributed/python_ppr_forward_push.cpp rename to gigl/csrc/sampling/python_ppr_forward_push.cpp diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py index 6285d67e6..9e8c8a482 100644 --- a/gigl/distributed/dist_ppr_sampler.py +++ b/gigl/distributed/dist_ppr_sampler.py @@ -14,7 +14,7 @@ from graphlearn_torch.typing import EdgeType, NodeType from graphlearn_torch.utils import merge_dict -from gigl.csrc.distributed import PPRForwardPushState +from gigl.csrc.sampling import PPRForwardPushState from 
gigl.distributed.dist_neighbor_sampler import DistNeighborSampler from gigl.types.graph import is_label_edge_type From 6e63172a3fe10155358d790710df4e3be964161c Mon Sep 17 00:00:00 2001 From: mkolodner Date: Wed, 1 Apr 2026 22:16:18 +0000 Subject: [PATCH 08/14] Update --- gigl/distributed/dist_ppr_sampler.py | 123 +++++++++++++++++++++------ gigl/distributed/sampler_options.py | 2 +- 2 files changed, 96 insertions(+), 29 deletions(-) diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py index 9e8c8a482..23c9d3c80 100644 --- a/gigl/distributed/dist_ppr_sampler.py +++ b/gigl/distributed/dist_ppr_sampler.py @@ -1,6 +1,5 @@ -# TODO (mkolodner-sc): Investigate whether concurrency for _sample_one_hop and _compute_ppr_scores will -# yield performance benefits. - +import asyncio +import time from collections import defaultdict from typing import Optional, Union @@ -24,6 +23,9 @@ # yield a bare EdgeType repr for ast.literal_eval). PPR_EDGE_INDEX_METADATA_KEY = "ppr_edge_index." PPR_WEIGHT_METADATA_KEY = "ppr_weight." +PPR_FETCH_TIME_MS_METADATA_KEY = "ppr_fetch_time_ms" +PPR_PUSH_TIME_MS_METADATA_KEY = "ppr_push_time_ms" +PPR_ITERATIONS_METADATA_KEY = "ppr_iterations" # Sentinel type names for homogeneous graphs. The PPR algorithm uses # dict[NodeType, ...] internally for both homo and hetero graphs; these @@ -292,18 +294,29 @@ async def _batch_fetch_neighbors( 5: (tensor([7]), tensor([0, 3]), tensor([2])), } """ - result: dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = {} - for eid, node_ids_tensor in nodes_by_etype_id.items(): - etype = self._etype_id_to_etype[eid] - # _sample_one_hop expects None for homogeneous graphs, not the PPR sentinel. 
- output: NeighborOutput = await self._sample_one_hop( - srcs=node_ids_tensor.to(device), - num_nbr=self._num_neighbors_per_hop, - etype=etype if etype != _PPR_HOMOGENEOUS_EDGE_TYPE else None, - ) - result[eid] = (node_ids_tensor, output.nbr, output.nbr_num) - - return result + # Fire all per-edge-type RPC calls concurrently. Each _sample_one_hop + # issues a single RPC round-trip; doing them in parallel rather than + # sequentially cuts fetch latency from O(num_edge_types) to O(1). + eids = list(nodes_by_etype_id.keys()) + outputs: list[NeighborOutput] = await asyncio.gather( + *[ + self._sample_one_hop( + srcs=nodes_by_etype_id[eid].to(device), + num_nbr=self._num_neighbors_per_hop, + # _sample_one_hop expects None for homogeneous graphs, not the PPR sentinel. + etype=( + self._etype_id_to_etype[eid] + if self._etype_id_to_etype[eid] != _PPR_HOMOGENEOUS_EDGE_TYPE + else None + ), + ) + for eid in eids + ] + ) + return { + eid: (nodes_by_etype_id[eid], output.nbr, output.nbr_num) + for eid, output in zip(eids, outputs) + } async def _compute_ppr_scores( self, @@ -313,6 +326,7 @@ async def _compute_ppr_scores( Union[torch.Tensor, dict[NodeType, torch.Tensor]], Union[torch.Tensor, dict[NodeType, torch.Tensor]], Union[torch.Tensor, dict[NodeType, torch.Tensor]], + tuple[float, float, int], ]: """ Compute PPR scores for seed nodes using the push-based approximation algorithm. @@ -354,6 +368,9 @@ async def _compute_ppr_scores( seed, shape ``[batch_size]``. Used to slice the flat tensors into per-seed groups: seed ``i``'s neighbors are at ``flat_neighbor_ids[sum(valid_counts[:i]) : sum(valid_counts[:i+1])]``. + - timing: ``(fetch_ms, push_ms, iterations)`` — wall-clock time + spent in neighbor fetch (ms), residual push (ms), and total loop + iteration count for this call. 
Example:: @@ -376,6 +393,10 @@ async def _compute_ppr_scores( self._degree_tensors_for_cpp, ) + total_fetch_ms = 0.0 + total_push_ms = 0.0 + total_iterations = 0 + while True: # drain_queue returns None when the queue is truly empty (convergence), # or a dict (possibly empty) when nodes were drained. An empty dict @@ -388,13 +409,18 @@ async def _compute_ppr_scores( nodes_by_etype_id: dict[int, torch.Tensor] = drain_result if nodes_by_etype_id: + fetch_start = time.perf_counter() fetched_by_etype_id = await self._batch_fetch_neighbors( nodes_by_etype_id, device ) + total_fetch_ms += (time.perf_counter() - fetch_start) * 1000 else: fetched_by_etype_id = {} + push_start = time.perf_counter() ppr_state.push_residuals(fetched_by_etype_id) + total_push_ms += (time.perf_counter() - push_start) * 1000 + total_iterations += 1 # Translate ntype_id integer keys back to NodeType strings for the rest # of the pipeline, and move tensors to the correct device. @@ -410,6 +436,7 @@ async def _compute_ppr_scores( ntype_to_flat_weights[ntype] = flat_weights.to(device) ntype_to_valid_counts[ntype] = valid_counts.to(device) + timing = (total_fetch_ms, total_push_ms, total_iterations) if self._is_homogeneous: assert ( len(ntype_to_flat_ids) == 1 @@ -419,9 +446,15 @@ async def _compute_ppr_scores( ntype_to_flat_ids[_PPR_HOMOGENEOUS_NODE_TYPE], ntype_to_flat_weights[_PPR_HOMOGENEOUS_NODE_TYPE], ntype_to_valid_counts[_PPR_HOMOGENEOUS_NODE_TYPE], + timing, ) else: - return ntype_to_flat_ids, ntype_to_flat_weights, ntype_to_valid_counts + return ( + ntype_to_flat_ids, + ntype_to_flat_weights, + ntype_to_valid_counts, + timing, + ) async def _sample_from_nodes( self, @@ -508,20 +541,42 @@ async def _sample_from_nodes( # NodeType -> global IDs (same values as nodes_to_sample). src_dict = inducer.init_node(nodes_to_sample) - # Compute PPR for each seed type, collecting flat global neighbor IDs, - # weights, and per-seed counts. 
Build nbr_dict for a single - # inducer.induce_next call using PPR edge types (seed_type, 'ppr', ntype) - # — the inducer only cares about etype[0] and etype[-1] as source/dest - # node types, so the relation name is arbitrary. + # Compute PPR for all seed types concurrently, collecting flat global + # neighbor IDs, weights, and per-seed counts. Build nbr_dict for a + # single inducer.induce_next call using PPR edge types + # (seed_type, 'ppr', ntype) — the inducer only cares about etype[0] + # and etype[-1] as source/dest node types, so the relation name is + # arbitrary. + # + # Each seed type's PPR computation is entirely independent: it creates + # its own PPRForwardPushState and only reads shared sampler attributes + # (degree tensors, edge-type maps) which are immutable after __init__. + # Running them with asyncio.gather allows their fetch phases to overlap, + # which is most beneficial when there are 2+ distinct seed node types + # (e.g. cross-type supervision edges like user→story). 
+ seed_types = list(nodes_to_sample.keys()) + ppr_results = await asyncio.gather( + *[ + self._compute_ppr_scores(nodes_to_sample[seed_type], seed_type) + for seed_type in seed_types + ] + ) + nbr_dict: dict[EdgeType, list[torch.Tensor]] = {} ppr_edge_type_to_flat_weights: dict[EdgeType, torch.Tensor] = {} - - for seed_type, seed_nodes in nodes_to_sample.items(): - ( - ntype_to_flat_ids, - ntype_to_flat_weights, - ntype_to_valid_counts, - ) = await self._compute_ppr_scores(seed_nodes, seed_type) + total_fetch_ms = 0.0 + total_push_ms = 0.0 + total_iterations = 0 + + for seed_type, ( + ntype_to_flat_ids, + ntype_to_flat_weights, + ntype_to_valid_counts, + (fetch_ms, push_ms, iterations), + ) in zip(seed_types, ppr_results): + total_fetch_ms += fetch_ms + total_push_ms += push_ms + total_iterations += iterations assert isinstance(ntype_to_flat_ids, dict) assert isinstance(ntype_to_flat_weights, dict) assert isinstance(ntype_to_valid_counts, dict) @@ -583,6 +638,12 @@ async def _sample_from_nodes( metadata[f"{PPR_EDGE_INDEX_METADATA_KEY}{etype_str}"] = edge_index metadata[f"{PPR_WEIGHT_METADATA_KEY}{etype_str}"] = flat_weights + metadata[PPR_FETCH_TIME_MS_METADATA_KEY] = torch.tensor(total_fetch_ms) + metadata[PPR_PUSH_TIME_MS_METADATA_KEY] = torch.tensor(total_push_ms) + metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor( + total_iterations, dtype=torch.long + ) + sample_output = HeteroSamplerOutput( node=node_dict, # row/col/edge are left empty rather than populated with PPR edges because @@ -613,6 +674,7 @@ async def _sample_from_nodes( homo_flat_ids, homo_flat_weights, homo_valid_counts, + (total_fetch_ms, total_push_ms, total_iterations), ) = await self._compute_ppr_scores(nodes_to_sample, None) assert isinstance(homo_flat_ids, torch.Tensor) assert isinstance(homo_flat_weights, torch.Tensor) @@ -634,6 +696,11 @@ async def _sample_from_nodes( metadata["edge_index"] = ppr_edge_index metadata["edge_attr"] = homo_flat_weights + 
metadata[PPR_FETCH_TIME_MS_METADATA_KEY] = torch.tensor(total_fetch_ms) + metadata[PPR_PUSH_TIME_MS_METADATA_KEY] = torch.tensor(total_push_ms) + metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor( + total_iterations, dtype=torch.long + ) sample_output = SamplerOutput( node=all_nodes, diff --git a/gigl/distributed/sampler_options.py b/gigl/distributed/sampler_options.py index d87a83d52..eccdd70f7 100644 --- a/gigl/distributed/sampler_options.py +++ b/gigl/distributed/sampler_options.py @@ -64,7 +64,7 @@ class PPRSamplerOptions: alpha: float = 0.5 eps: float = 1e-4 max_ppr_nodes: int = 50 - num_neighbors_per_hop: int = 100_000 + num_neighbors_per_hop: int = 1_000 total_degree_dtype: torch.dtype = torch.int32 From c16dd9d38300d30a542407d659df236644ac2d24 Mon Sep 17 00:00:00 2001 From: mkolodner Date: Wed, 1 Apr 2026 23:30:23 +0000 Subject: [PATCH 09/14] Fix type check and remove unused etypes from num_sampled_edges --- gigl/distributed/utils/neighborloader.py | 1 + scripts/build_cpp_extensions.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gigl/distributed/utils/neighborloader.py b/gigl/distributed/utils/neighborloader.py index 974d12cf6..d0d96638f 100644 --- a/gigl/distributed/utils/neighborloader.py +++ b/gigl/distributed/utils/neighborloader.py @@ -189,6 +189,7 @@ def strip_non_ppr_edge_types( for edge_type in list(data.edge_types): if edge_type not in ppr_edge_types: del data[edge_type] + data.num_sampled_edges.pop(edge_type, None) return data diff --git a/scripts/build_cpp_extensions.py b/scripts/build_cpp_extensions.py index d05740860..f99e47f9b 100644 --- a/scripts/build_cpp_extensions.py +++ b/scripts/build_cpp_extensions.py @@ -10,13 +10,13 @@ from pathlib import Path -from setuptools import setup +from setuptools import Extension, setup from torch.utils.cpp_extension import BuildExtension, CppExtension _CSRC_DIR = Path("gigl/csrc") -def find_cpp_extensions() -> list[CppExtension]: +def find_cpp_extensions() -> list[Extension]: 
"""Auto-discover pybind11 extension modules under ``gigl/csrc/``. Following PyTorch's csrc convention, only files named ``python_*.cpp`` are From d651f41f6f05028c5294ffdbb9d0d548056d3095 Mon Sep 17 00:00:00 2001 From: mkolodner Date: Mon, 6 Apr 2026 23:29:52 +0000 Subject: [PATCH 10/14] fetched-count normalization, nodes_drained diagnostic, max_fetch_iterations, per-iteration timing metadata --- gigl/csrc/sampling/ppr_forward_push.cpp | 38 ++++- gigl/csrc/sampling/ppr_forward_push.h | 11 ++ gigl/csrc/sampling/ppr_forward_push.pyi | 1 + .../csrc/sampling/python_ppr_forward_push.cpp | 3 +- gigl/distributed/dist_ppr_sampler.py | 134 ++++++++++++++++-- gigl/distributed/dist_sampling_producer.py | 1 + gigl/distributed/sampler_options.py | 7 + 7 files changed, 175 insertions(+), 20 deletions(-) diff --git a/gigl/csrc/sampling/ppr_forward_push.cpp b/gigl/csrc/sampling/ppr_forward_push.cpp index a514907ab..1bbf05dbe 100644 --- a/gigl/csrc/sampling/ppr_forward_push.cpp +++ b/gigl/csrc/sampling/ppr_forward_push.cpp @@ -57,6 +57,7 @@ std::optional> PPRForwardPushState::d // in multiple seeds' queues: we only fetch each (node, etype) pair once. std::unordered_map> nodes_to_lookup; + int32_t total_drained_this_round = 0; for (int32_t s = 0; s < batch_size_; ++s) { for (int32_t nt = 0; nt < num_node_types_; ++nt) { if (queue_[s][nt].empty()) @@ -65,6 +66,7 @@ std::optional> PPRForwardPushState::d // Move the live queue into the snapshot (no data copy — O(1)). 
queued_nodes_[s][nt] = std::move(queue_[s][nt]); queue_[s][nt].clear(); + total_drained_this_round += static_cast(queued_nodes_[s][nt].size()); num_nodes_in_queue_ -= static_cast(queued_nodes_[s][nt].size()); for (int32_t node_id : queued_nodes_[s][nt]) { @@ -77,6 +79,8 @@ std::optional> PPRForwardPushState::d } } + nodes_drained_per_iteration_.push_back(total_drained_this_round); + std::unordered_map result; for (auto& [eid, node_set] : nodes_to_lookup) { std::vector ids(node_set.begin(), node_set.end()); @@ -85,6 +89,10 @@ std::optional> PPRForwardPushState::d return result; } +const std::vector& PPRForwardPushState::get_nodes_drained_per_iteration() const { + return nodes_drained_per_iteration_; +} + void PPRForwardPushState::push_residuals( const std::unordered_map>& fetched_by_etype_id) { @@ -135,13 +143,28 @@ void PPRForwardPushState::push_residuals( ppr_scores_[s][nt][src] += res; src_res[src] = 0.0; - int32_t total_deg = get_total_degree(src, nt); - // Destination-only nodes absorb residual but do not push further. - if (total_deg == 0) + // b. Count total fetched/cached neighbors across all edge types for + // this source node. We normalise by the number of neighbors we + // actually retrieved, not the true degree, so residual is fully + // distributed among known neighbors rather than leaking to unfetched + // ones (which matters when num_neighbors_per_hop < true_degree). + int32_t total_fetched = 0; + for (int32_t eid : node_type_to_edge_type_ids_[nt]) { + auto fi = fetched.find(pack_key(src, eid)); + if (fi != fetched.end()) { + total_fetched += static_cast(fi->second.size()); + } else { + auto ci = neighbor_cache_.find(pack_key(src, eid)); + if (ci != neighbor_cache_.end()) + total_fetched += static_cast(ci->second.size()); + } + } + // Destination-only nodes (or nodes with no fetched neighbors) absorb + // residual but do not push further. + if (total_fetched == 0) continue; - // b. Distribute: each neighbor receives an equal share. 
- double res_per_nbr = one_minus_alpha_ * res / static_cast(total_deg); + double res_per_nbr = one_minus_alpha_ * res / static_cast(total_fetched); for (int32_t eid : node_type_to_edge_type_ids_[nt]) { // Invariant: fetched and neighbor_cache_ are mutually exclusive for @@ -167,8 +190,9 @@ void PPRForwardPushState::push_residuals( for (int32_t nbr : *nbr_list) { residuals_[s][dst_nt][nbr] += res_per_nbr; - double threshold = requeue_threshold_factor_ * - static_cast(get_total_degree(nbr, dst_nt)); + double threshold = + requeue_threshold_factor_ * + static_cast(get_total_degree(nbr, dst_nt)); if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() && residuals_[s][dst_nt][nbr] >= threshold) { diff --git a/gigl/csrc/sampling/ppr_forward_push.h b/gigl/csrc/sampling/ppr_forward_push.h index 7f0c92f49..82973ff7a 100644 --- a/gigl/csrc/sampling/ppr_forward_push.h +++ b/gigl/csrc/sampling/ppr_forward_push.h @@ -118,4 +118,15 @@ class PPRForwardPushState { // Neighbor cache // ------------------------------------------------------------------------- std::unordered_map> neighbor_cache_; + + // ------------------------------------------------------------------------- + // Diagnostics (populated during the algorithm; read after convergence) + // ------------------------------------------------------------------------- + // Total nodes drained (across all seeds and node types) in each drain_queue() + // call. One entry per loop iteration; useful for understanding convergence speed. + std::vector nodes_drained_per_iteration_; + + public: + // Returns nodes_drained_per_iteration_ built up across all drain_queue() calls. 
+ const std::vector& get_nodes_drained_per_iteration() const; }; diff --git a/gigl/csrc/sampling/ppr_forward_push.pyi b/gigl/csrc/sampling/ppr_forward_push.pyi index 265468c3c..9a3c78fea 100644 --- a/gigl/csrc/sampling/ppr_forward_push.pyi +++ b/gigl/csrc/sampling/ppr_forward_push.pyi @@ -19,3 +19,4 @@ class PPRForwardPushState: def extract_top_k( self, max_ppr_nodes: int ) -> dict[int, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: ... + def get_nodes_drained_per_iteration(self) -> list[int]: ... diff --git a/gigl/csrc/sampling/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp index ebf3fa27a..4a296abf8 100644 --- a/gigl/csrc/sampling/python_ppr_forward_push.cpp +++ b/gigl/csrc/sampling/python_ppr_forward_push.cpp @@ -59,5 +59,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { std::vector>()) .def("drain_queue", drain_queue_wrapper) .def("push_residuals", push_residuals_wrapper) - .def("extract_top_k", extract_top_k_wrapper); + .def("extract_top_k", extract_top_k_wrapper) + .def("get_nodes_drained_per_iteration", &PPRForwardPushState::get_nodes_drained_per_iteration); } diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py index 891b70772..b27786a31 100644 --- a/gigl/distributed/dist_ppr_sampler.py +++ b/gigl/distributed/dist_ppr_sampler.py @@ -13,10 +13,13 @@ from graphlearn_torch.typing import EdgeType, NodeType from graphlearn_torch.utils import merge_dict +from gigl.common.logger import Logger from gigl.csrc.sampling import PPRForwardPushState from gigl.distributed.base_sampler import BaseDistNeighborSampler from gigl.types.graph import is_label_edge_type +_logger = Logger() + # Trailing "." is an intentional separator. These constants are used both to # write metadata keys (f"{KEY}{repr(edge_type)}" → e.g. 
"ppr_edge_index.('user', 'to', 'story')") # and as the strip prefix in extract_edge_type_metadata (key[len(prefix):] must @@ -26,6 +29,10 @@ PPR_FETCH_TIME_MS_METADATA_KEY = "ppr_fetch_time_ms" PPR_PUSH_TIME_MS_METADATA_KEY = "ppr_push_time_ms" PPR_ITERATIONS_METADATA_KEY = "ppr_iterations" +PPR_NODES_PER_ITERATION_METADATA_KEY = "ppr_nodes_per_iteration" +PPR_FETCH_TIME_PER_ITER_MS_METADATA_KEY = "ppr_fetch_time_per_iter_ms" +PPR_PUSH_TIME_PER_ITER_MS_METADATA_KEY = "ppr_push_time_per_iter_ms" +PPR_NODES_NEEDING_FETCH_PER_ITER_METADATA_KEY = "ppr_nodes_needing_fetch_per_iter" # Sentinel type names for homogeneous graphs. The PPR algorithm uses # dict[NodeType, ...] internally for both homo and hetero graphs; these @@ -80,9 +87,10 @@ class DistPPRNeighborSampler(BaseDistNeighborSampler): but require more computation. Typical values: 1e-4 to 1e-6. max_ppr_nodes: Maximum number of nodes to return per seed based on PPR scores. num_neighbors_per_hop: Maximum number of neighbors to fetch per hop. - total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults to - ``torch.int32``, which supports total degrees up to ~2 billion. Use a - larger dtype if nodes have exceptionally high aggregate degrees. + total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults + to ``torch.int32``. Use a larger dtype if nodes have exceptionally high + aggregate degrees. + degree_tensors: Pre-computed degree tensors from the dataset. 
""" def __init__( @@ -94,6 +102,7 @@ def __init__( num_neighbors_per_hop: int = 100_000, total_degree_dtype: torch.dtype = torch.int32, degree_tensors: Union[torch.Tensor, dict[EdgeType, torch.Tensor]], + max_fetch_iterations: int = 0, **kwargs, ): super().__init__(*args, **kwargs) @@ -102,6 +111,7 @@ def __init__( self._max_ppr_nodes = max_ppr_nodes self._requeue_threshold_factor = alpha * eps self._num_neighbors_per_hop = num_neighbors_per_hop + self._max_fetch_iterations = max_fetch_iterations # Build mapping from node type to edge types that can be traversed from that node type. self._node_type_to_edge_types: dict[NodeType, list[EdgeType]] = defaultdict( @@ -321,7 +331,7 @@ async def _compute_ppr_scores( Union[torch.Tensor, dict[NodeType, torch.Tensor]], Union[torch.Tensor, dict[NodeType, torch.Tensor]], Union[torch.Tensor, dict[NodeType, torch.Tensor]], - tuple[float, float, int], + tuple[float, float, int, list[int], list[float], list[float], list[int]], ]: """ Compute PPR scores for seed nodes using the push-based approximation algorithm. @@ -391,6 +401,10 @@ async def _compute_ppr_scores( total_fetch_ms = 0.0 total_push_ms = 0.0 total_iterations = 0 + fetch_iteration_count = 0 + fetch_ms_per_iter: list[float] = [] + push_ms_per_iter: list[float] = [] + nodes_needing_fetch_per_iter: list[int] = [] while True: # drain_queue returns None when the queue is truly empty (convergence), @@ -398,23 +412,41 @@ async def _compute_ppr_scores( # means all drained nodes either had cached neighbors or no outgoing # edges — we still call push_residuals to flush their residuals into # ppr_scores_. 
- drain_result: dict[int, torch.Tensor] | None = ppr_state.drain_queue() + drain_result: Optional[dict[int, torch.Tensor]] = ppr_state.drain_queue() if drain_result is None: break nodes_by_etype_id: dict[int, torch.Tensor] = drain_result - if nodes_by_etype_id: + fetch_budget_remaining = ( + self._max_fetch_iterations == 0 + or fetch_iteration_count < self._max_fetch_iterations + ) + if nodes_by_etype_id and fetch_budget_remaining: + # Total (node, edge_type) pairs needing RPCs this iteration. + # A node with uncached neighbors for k edge types contributes k here. + nodes_needing_fetch = sum(t.numel() for t in nodes_by_etype_id.values()) fetch_start = time.perf_counter() fetched_by_etype_id = await self._batch_fetch_neighbors( nodes_by_etype_id, device ) - total_fetch_ms += (time.perf_counter() - fetch_start) * 1000 + iter_fetch_ms = (time.perf_counter() - fetch_start) * 1000 + total_fetch_ms += iter_fetch_ms + fetch_iteration_count += 1 else: + # Either all nodes are cached, or the fetch budget is exhausted. + # push_residuals will propagate using the existing neighbor cache. 
+ nodes_needing_fetch = 0 fetched_by_etype_id = {} + iter_fetch_ms = 0.0 + + nodes_needing_fetch_per_iter.append(nodes_needing_fetch) + fetch_ms_per_iter.append(iter_fetch_ms) push_start = time.perf_counter() ppr_state.push_residuals(fetched_by_etype_id) - total_push_ms += (time.perf_counter() - push_start) * 1000 + iter_push_ms = (time.perf_counter() - push_start) * 1000 + total_push_ms += iter_push_ms + push_ms_per_iter.append(iter_push_ms) total_iterations += 1 # Translate ntype_id integer keys back to NodeType strings for the rest @@ -431,7 +463,18 @@ async def _compute_ppr_scores( ntype_to_flat_weights[ntype] = flat_weights.to(device) ntype_to_valid_counts[ntype] = valid_counts.to(device) - timing = (total_fetch_ms, total_push_ms, total_iterations) + nodes_drained_per_iteration: list[ + int + ] = ppr_state.get_nodes_drained_per_iteration() + timing = ( + total_fetch_ms, + total_push_ms, + total_iterations, + nodes_drained_per_iteration, + fetch_ms_per_iter, + push_ms_per_iter, + nodes_needing_fetch_per_iter, + ) if self._is_homogeneous: assert ( len(ntype_to_flat_ids) == 1 @@ -562,16 +605,48 @@ async def _sample_from_nodes( total_fetch_ms = 0.0 total_push_ms = 0.0 total_iterations = 0 + total_nodes_per_iteration: list[int] = [] + total_fetch_ms_per_iter: list[float] = [] + total_push_ms_per_iter: list[float] = [] + total_nodes_needing_fetch_per_iter: list[int] = [] for seed_type, ( ntype_to_flat_ids, ntype_to_flat_weights, ntype_to_valid_counts, - (fetch_ms, push_ms, iterations), + ( + fetch_ms, + push_ms, + iterations, + nodes_per_iter, + fetch_ms_per_iter, + push_ms_per_iter, + nodes_needing_fetch_per_iter, + ), ) in zip(seed_types, ppr_results): total_fetch_ms += fetch_ms total_push_ms += push_ms total_iterations += iterations + for i, count in enumerate(nodes_per_iter): + if i < len(total_nodes_per_iteration): + total_nodes_per_iteration[i] += count + else: + total_nodes_per_iteration.append(count) + for i, val in enumerate(fetch_ms_per_iter): + if i < 
len(total_fetch_ms_per_iter): + total_fetch_ms_per_iter[i] += val + else: + total_fetch_ms_per_iter.append(val) + for i, val in enumerate(push_ms_per_iter): + if i < len(total_push_ms_per_iter): + total_push_ms_per_iter[i] += val + else: + total_push_ms_per_iter.append(val) + for i, val in enumerate(nodes_needing_fetch_per_iter): + if i < len(total_nodes_needing_fetch_per_iter): + total_nodes_needing_fetch_per_iter[i] += val + else: + total_nodes_needing_fetch_per_iter.append(val) assert isinstance(ntype_to_flat_ids, dict) assert isinstance(ntype_to_flat_weights, dict) assert isinstance(ntype_to_valid_counts, dict) @@ -621,7 +696,10 @@ async def _sample_from_nodes( # rows_dict and cols_dict are keyed by PPR edge type and give # flat local source/destination indices respectively, aligned with # the flat_ids order passed to induce_next. - for ppr_edge_type, flat_weights in ppr_edge_type_to_flat_weights.items(): + for ( + ppr_edge_type, + flat_weights, + ) in ppr_edge_type_to_flat_weights.items(): rows = rows_dict.get(ppr_edge_type) cols = cols_dict.get(ppr_edge_type) if rows is not None and cols is not None: @@ -638,6 +716,18 @@ async def _sample_from_nodes( metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor( total_iterations, dtype=torch.long ) + metadata[PPR_NODES_PER_ITERATION_METADATA_KEY] = torch.tensor( + total_nodes_per_iteration, dtype=torch.long + ) + metadata[PPR_FETCH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor( + total_fetch_ms_per_iter, dtype=torch.float + ) + metadata[PPR_PUSH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor( + total_push_ms_per_iter, dtype=torch.float + ) + metadata[PPR_NODES_NEEDING_FETCH_PER_ITER_METADATA_KEY] = torch.tensor( + total_nodes_needing_fetch_per_iter, dtype=torch.long + ) sample_output = HeteroSamplerOutput( node=node_dict, @@ -669,7 +759,15 @@ async def _sample_from_nodes( homo_flat_ids, homo_flat_weights, homo_valid_counts, - (total_fetch_ms, total_push_ms, total_iterations), + ( + total_fetch_ms, + total_push_ms, + 
total_iterations, + total_nodes_per_iteration, + total_fetch_ms_per_iter, + total_push_ms_per_iter, + total_nodes_needing_fetch_per_iter, + ), ) = await self._compute_ppr_scores(nodes_to_sample, None) assert isinstance(homo_flat_ids, torch.Tensor) assert isinstance(homo_flat_weights, torch.Tensor) @@ -696,6 +794,18 @@ async def _sample_from_nodes( metadata[PPR_ITERATIONS_METADATA_KEY] = torch.tensor( total_iterations, dtype=torch.long ) + metadata[PPR_NODES_PER_ITERATION_METADATA_KEY] = torch.tensor( + total_nodes_per_iteration, dtype=torch.long + ) + metadata[PPR_FETCH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor( + total_fetch_ms_per_iter, dtype=torch.float + ) + metadata[PPR_PUSH_TIME_PER_ITER_MS_METADATA_KEY] = torch.tensor( + total_push_ms_per_iter, dtype=torch.float + ) + metadata[PPR_NODES_NEEDING_FETCH_PER_ITER_METADATA_KEY] = torch.tensor( + total_nodes_needing_fetch_per_iter, dtype=torch.long + ) sample_output = SamplerOutput( node=all_nodes, diff --git a/gigl/distributed/dist_sampling_producer.py b/gigl/distributed/dist_sampling_producer.py index f155bd929..490514c6c 100644 --- a/gigl/distributed/dist_sampling_producer.py +++ b/gigl/distributed/dist_sampling_producer.py @@ -131,6 +131,7 @@ def _sampling_worker_loop( num_neighbors_per_hop=sampler_options.num_neighbors_per_hop, total_degree_dtype=sampler_options.total_degree_dtype, degree_tensors=degree_tensors, + max_fetch_iterations=sampler_options.max_fetch_iterations, ) else: raise NotImplementedError( diff --git a/gigl/distributed/sampler_options.py b/gigl/distributed/sampler_options.py index 639a932a5..d72c9092e 100644 --- a/gigl/distributed/sampler_options.py +++ b/gigl/distributed/sampler_options.py @@ -59,6 +59,12 @@ class PPRSamplerOptions: total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults to ``torch.int32``, which supports total degrees up to ~2 billion. Use a larger dtype if nodes have exceptionally high aggregate degrees. 
+ max_fetch_iterations: Maximum number of iterations that issue RPC neighbor + fetches. After this many fetch iterations, subsequent iterations push + residuals using only already-cached neighbor lists (no new RPCs). + The algorithm still runs to convergence — re-enqueued nodes propagate + through cached neighbors at negligible cost. Set to 0 (default) for + no fetch limit. """ alpha: float = 0.5 @@ -66,6 +72,7 @@ class PPRSamplerOptions: max_ppr_nodes: int = 50 num_neighbors_per_hop: int = 1_000 total_degree_dtype: torch.dtype = torch.int32 + max_fetch_iterations: int = 0 SamplerOptions = Union[KHopNeighborSamplerOptions, PPRSamplerOptions] From 35e52bcda2527b34bb7ffb885e13585a6193ec8d Mon Sep 17 00:00:00 2001 From: mkolodner Date: Fri, 10 Apr 2026 18:00:52 +0000 Subject: [PATCH 11/14] Add event loop threading --- gigl/csrc/sampling/python_ppr_forward_push.cpp | 9 ++++++++- gigl/distributed/dist_ppr_sampler.py | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/gigl/csrc/sampling/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp index 4a296abf8..78de24cbc 100644 --- a/gigl/csrc/sampling/python_ppr_forward_push.cpp +++ b/gigl/csrc/sampling/python_ppr_forward_push.cpp @@ -29,13 +29,20 @@ static py::object drain_queue_wrapper(PPRForwardPushState& self) { // Convert to C++ map before delegating. static void push_residuals_wrapper(PPRForwardPushState& self, py::dict fetched_by_etype_id) { std::unordered_map> cpp_map; + // Dict iteration touches Python objects — GIL must be held here. for (auto item : fetched_by_etype_id) { int32_t eid = item.first.cast(); auto tup = item.second.cast(); cpp_map[eid] = {tup[0].cast(), tup[1].cast(), tup[2].cast()}; } - self.push_residuals(cpp_map); + // C++ push only uses tensor accessor/data_ptr APIs — GIL-safe to release. + // Releasing here lets the asyncio event loop process RPC completion callbacks + // from other concurrent PPR coroutines while this push runs. 
+ { + py::gil_scoped_release release; + self.push_residuals(cpp_map); + } } // extract_top_k: C++ returns map>. diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py index b27786a31..dfb9220d2 100644 --- a/gigl/distributed/dist_ppr_sampler.py +++ b/gigl/distributed/dist_ppr_sampler.py @@ -443,7 +443,9 @@ async def _compute_ppr_scores( fetch_ms_per_iter.append(iter_fetch_ms) push_start = time.perf_counter() - ppr_state.push_residuals(fetched_by_etype_id) + await asyncio.get_running_loop().run_in_executor( + None, ppr_state.push_residuals, fetched_by_etype_id + ) iter_push_ms = (time.perf_counter() - push_start) * 1000 total_push_ms += iter_push_ms push_ms_per_iter.append(iter_push_ms) From 4a3beac2b24b062842eb520900c8592605a5402b Mon Sep 17 00:00:00 2001 From: mkolodner Date: Fri, 10 Apr 2026 21:01:20 +0000 Subject: [PATCH 12/14] Update guidance --- .clang-tidy | 8 +++++++- docs/cpp_style_guide.md | 17 +++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 279b7f030..3198f4627 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -40,6 +40,12 @@ WarningsAsErrors: '*' HeaderFilterRegex: '.*' FormatStyle: none User: jenkins +# CheckOptions: per-check tuning parameters. Each entry configures a specific +# option for an individual check, using the form: +# key: <check-name>.<option-name> +# value: <option-value> +# These let you adjust thresholds, naming patterns, and behavior without +# enabling or disabling the check entirely.
CheckOptions: - key: bugprone-argument-comment.StrictMode value: '0' @@ -142,7 +148,7 @@ CheckOptions: - key: performance-type-promotion-in-math-fn.IncludeStyle value: llvm - key: readability-braces-around-statements.ShortStatementLines - value: '2' + value: '0' - key: readability-function-size.BranchThreshold value: '4294967295' - key: readability-function-size.LineThreshold diff --git a/docs/cpp_style_guide.md b/docs/cpp_style_guide.md index 4aa0c30c1..8fc84296e 100644 --- a/docs/cpp_style_guide.md +++ b/docs/cpp_style_guide.md @@ -5,16 +5,13 @@ GiGL enforces C++ style automatically via two tools: - **clang-format** (`.clang-format`) — code formatting - **clang-tidy** (`.clang-tidy`) — static analysis and lint -Both run as part of CI. All clang-tidy warnings are treated as errors. +All clang-tidy warnings are treated as errors. ## Running the Tools ```bash -# Format all C++ files in-place -clang-format -i $(find gigl/csrc -name '*.cpp' -o -name '*.h') - -# Run static analysis -clang-tidy gigl/csrc/**/*.cpp +make format_cpp # Format all C++ files in-place +make lint_cpp # Run clang-tidy static analysis ``` ______________________________________________________________________ @@ -99,11 +96,11 @@ Enabled families: | -------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | | `boost-use-to-string` | Prefer `std::to_string` over `boost::lexical_cast` for numeric conversions | | `bugprone-*` | Common programming mistakes: dangling handles, suspicious string construction, assert side effects, etc. 
| -| `cert-*` (selected) | CERT secure coding rules for error handling (`err34-c`), floating-point loops (`flp30-c`), and RNG seeding (`msc32-c`, `msc50/51-cpp`) | +| `cert-*` | CERT secure coding rules for error handling (`err34-c`), floating-point loops (`flp30-c`), and RNG seeding (`msc32-c`, `msc50/51-cpp`) | | `clang-analyzer-*` | Clang static analyzer: memory safety, null dereferences, use-after-free, etc. | | `clang-diagnostic-*` | Compiler diagnostic warnings surfaced as lint checks | -| `cppcoreguidelines-*` (selected) | C++ Core Guidelines: no raw `malloc`, no union member access, no object slicing, safe downcasts | -| `google-*` (selected) | Google C++ style: explicit constructors, no global names in headers, safe `memset` usage | +| `cppcoreguidelines-*` | C++ Core Guidelines: no raw `malloc`, no union member access, no object slicing, safe downcasts | +| `google-*` | Google C++ style: explicit constructors, no global names in headers, safe `memset` usage | | `hicpp-exception-baseclass` | All thrown exceptions must derive from `std::exception` | | `misc-*` | Miscellaneous: header-only definitions, suspicious enum usage, throw-by-value/catch-by-reference, etc. | | `modernize-*` | Modernize to C++11/14/17: `nullptr`, range-based for, `make_unique`, `using` aliases, etc. 
| @@ -150,4 +147,4 @@ Enforced via `readability-identifier-naming`: | `bugprone-string-constructor.LargeLengthThreshold` | `8388608` (8 MB) | Strings larger than 8 MB from a length argument are flagged | | `modernize-loop-convert.NamingStyle` | `CamelCase` | Auto-generated loop variable names use CamelCase | | `readability-function-size.LineThreshold` | `1000` | Functions over 1000 lines are flagged | -| `readability-braces-around-statements.ShortStatementLines` | `2` | Single-line bodies up to 2 lines may omit braces | +| `readability-braces-around-statements.ShortStatementLines` | `0` | Braces required for all control-flow bodies, even single-line | From 1d1dbfd290c2106044aa2035c9dceb744f586ba9 Mon Sep 17 00:00:00 2001 From: mkolodner Date: Fri, 10 Apr 2026 21:38:11 +0000 Subject: [PATCH 13/14] Update --- Makefile | 4 +-- gigl/scripts/post_install.py | 47 +++++++--------------------- requirements/install_cpp_deps.sh | 12 ++++--- scripts/build_cpp_extensions.py | 10 +++--- scripts/cpp_build_constants.py | 17 ++++++++++ scripts/generate_compile_commands.py | 38 ++++++++-------------- 6 files changed, 55 insertions(+), 73 deletions(-) create mode 100644 scripts/cpp_build_constants.py diff --git a/Makefile b/Makefile index 28bbd3d18..79e2a02ba 100644 --- a/Makefile +++ b/Makefile @@ -77,7 +77,7 @@ assert_yaml_configs_parse: # Ex. `make unit_test_py PY_TEST_FILES="eval_metrics_test.py"` # By default, runs all tests under tests/unit. # See the help text for "--test_file_pattern" in tests/test_args.py for more details. 
-unit_test_py: clean_build_files_py type_check build_cpp_extensions +unit_test_py: clean_build_files_py build_cpp_extensions type_check uv run python -m tests.unit.main \ --env=test \ --resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \ @@ -119,7 +119,7 @@ check_format_md: check_format_cpp: $(if $(CPP_SOURCES), clang-format --dry-run --Werror --style=file $(CPP_SOURCES)) -check_format: check_format_py check_format_scala check_format_md check_format_cpp +check_format: check_format_py check_format_cpp check_format_scala check_format_md # Set PY_TEST_FILES= to test a specifc file. # Ex. `make integration_test PY_TEST_FILES="dataflow_test.py"` diff --git a/gigl/scripts/post_install.py b/gigl/scripts/post_install.py index d31b4244a..bafc4ae21 100644 --- a/gigl/scripts/post_install.py +++ b/gigl/scripts/post_install.py @@ -11,28 +11,6 @@ import subprocess import sys from pathlib import Path -from typing import Optional - - -def run_command_and_stream_stdout(cmd: str) -> Optional[int]: - """ - Executes a command and streams the stdout output. - - Args: - cmd (str): The command to be executed. - - Returns: - Optional[int]: The return code of the command, or None if the command failed to execute. 
- """ - process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) - while True: - output = process.stdout.readline() # type: ignore - if output == b"" and process.poll() is not None: - break - if output: - print(output.strip()) - return_code: Optional[int] = process.poll() - return return_code def main(): @@ -49,26 +27,26 @@ def main(): sys.exit(1) try: - print(f"Executing bash {install_glt_script}...") - result = run_command_and_stream_stdout(f"bash {install_glt_script}") - print("GLT install finished with return code:", result) - except Exception as e: - print(f"Unexpected error: {e}") + print(f"Installing GLT via {install_glt_script}...") + subprocess.run(["bash", str(install_glt_script)], check=True) + print("GLT install finished.") + except subprocess.CalledProcessError as e: + print(f"Error installing GLT: {e}") sys.exit(1) # Step 2: Build pybind11 C++ extensions in-place so they are importable # without requiring a separate `make build_cpp_extensions` call. # subprocess.run streams stdout/stderr to the terminal and raises # CalledProcessError on a non-zero exit code. 
+ build_cpp_script = repo_root / "scripts" / "build_cpp_extensions.py" + if not build_cpp_script.exists(): + print(f"Error: build_cpp_extensions.py not found at {build_cpp_script}") + sys.exit(1) + try: print("Building C++ extensions...") subprocess.run( - [ - sys.executable, - "scripts/build_cpp_extensions.py", - "build_ext", - "--inplace", - ], + [sys.executable, str(build_cpp_script), "build_ext", "--inplace"], cwd=repo_root, check=True, ) @@ -76,9 +54,6 @@ def main(): except subprocess.CalledProcessError as e: print(f"Error building C++ extensions: {e}") sys.exit(1) - except Exception as e: - print(f"Unexpected error building C++ extensions: {e}") - sys.exit(1) if __name__ == "__main__": diff --git a/requirements/install_cpp_deps.sh b/requirements/install_cpp_deps.sh index 6b9f19113..1e1a6cf16 100644 --- a/requirements/install_cpp_deps.sh +++ b/requirements/install_cpp_deps.sh @@ -6,10 +6,15 @@ # # Called by `make install_dev_deps` alongside install_py_deps.sh and # install_scala_deps.sh. +# +# NOTE: On Linux, this script calls apt-get and update-alternatives, which +# require root privileges. Run as root or prefix with sudo. set -e set -x +CLANG_VERSION=15 + is_running_on_mac() { [ "$(uname)" == "Darwin" ] return $? @@ -35,12 +40,11 @@ if is_running_on_mac; then # install_dev_deps to pick up the PATH change. export PATH="$LLVM_BIN:$PATH" else - # Ubuntu / Debian — clang 15 is the highest version available on Ubuntu 22.04. - apt-get install -y clang-format-15 clang-tidy-15 cmake + apt-get install -y "clang-format-${CLANG_VERSION}" "clang-tidy-${CLANG_VERSION}" cmake # Register versioned binaries as the default so bare `clang-format` and # `clang-tidy` resolve to them without callers specifying the version suffix. 
- update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-15 100 - update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-15 100 + update-alternatives --install /usr/bin/clang-format clang-format "/usr/bin/clang-format-${CLANG_VERSION}" 100 + update-alternatives --install /usr/bin/clang-tidy clang-tidy "/usr/bin/clang-tidy-${CLANG_VERSION}" 100 fi echo "Finished installing C++ tooling" diff --git a/scripts/build_cpp_extensions.py b/scripts/build_cpp_extensions.py index 5bc3dd8bf..14a6d3911 100644 --- a/scripts/build_cpp_extensions.py +++ b/scripts/build_cpp_extensions.py @@ -8,13 +8,11 @@ python build_cpp_extensions.py build_ext --inplace """ -from pathlib import Path - from setuptools import setup from setuptools.extension import Extension from torch.utils.cpp_extension import BuildExtension, CppExtension -_CSRC_DIR = Path("gigl/csrc") +from cpp_build_constants import COMPILE_ARGS, CSRC_DIR def find_cpp_extensions() -> list[Extension]: @@ -25,10 +23,10 @@ def find_cpp_extensions() -> list[Extension]: Returns an empty list if ``gigl/csrc/`` does not yet exist. """ - if not _CSRC_DIR.exists(): + if not CSRC_DIR.exists(): return [] extensions = [] - for cpp_file in sorted(_CSRC_DIR.rglob("python_*.cpp")): + for cpp_file in sorted(CSRC_DIR.rglob("python_*.cpp")): parts = list(cpp_file.with_suffix("").parts) parts[-1] = parts[-1].removeprefix("python_") module_name = ".".join(parts) @@ -40,7 +38,7 @@ def find_cpp_extensions() -> list[Extension]: CppExtension( name=module_name, sources=sources, - extra_compile_args=["-O3", "-std=c++17", "-Wall", "-Wextra"], + extra_compile_args=COMPILE_ARGS, ) ) return extensions diff --git a/scripts/cpp_build_constants.py b/scripts/cpp_build_constants.py new file mode 100644 index 000000000..ab1c7cc36 --- /dev/null +++ b/scripts/cpp_build_constants.py @@ -0,0 +1,17 @@ +"""Shared C++ build constants for build_cpp_extensions.py and generate_compile_commands.py. 
+ +This is the single source of truth for C++ compiler flags and source paths. +Both scripts import from here so clang-tidy always analyzes with the same flags +used during the actual build. +""" + +from pathlib import Path + +# REPO_ROOT is derived from this file's location — this file must live in scripts/. +REPO_ROOT: Path = Path(__file__).resolve().parent.parent +CSRC_DIR: Path = REPO_ROOT / "gigl" / "csrc" + +# Flags passed to every C++ compilation unit. Applies to both the extension +# build (build_cpp_extensions.py) and the compile_commands.json used by +# clang-tidy (generate_compile_commands.py). +COMPILE_ARGS: list[str] = ["-O3", "-std=c++17", "-Wall", "-Wextra"] diff --git a/scripts/generate_compile_commands.py b/scripts/generate_compile_commands.py index eec176848..d42d52682 100644 --- a/scripts/generate_compile_commands.py +++ b/scripts/generate_compile_commands.py @@ -9,58 +9,46 @@ uv run python scripts/generate_compile_commands.py Output: ``build/compile_commands.json`` (created or overwritten). + +Note: run ``make build_cpp_extensions`` before this script (or use ``make lint_cpp``, +which does both in the correct order) so the database reflects the current build state. """ import json -import subprocess import sys import sysconfig -from pathlib import Path from torch.utils.cpp_extension import include_paths as torch_include_paths +from cpp_build_constants import COMPILE_ARGS, CSRC_DIR, REPO_ROOT -def main() -> None: - repo_root = Path(__file__).parent.parent.resolve() - - # Always rebuild C++ extensions before generating compile_commands.json so - # the database reflects the current state of the code. - subprocess.run( - [sys.executable, "scripts/build_cpp_extensions.py", "build_ext", "--inplace"], - cwd=repo_root, - check=True, - ) +def main() -> None: # Collect all include directories needed to compile the extension. 
# torch_include_paths() returns the torch headers, which already bundle # pybind11 under torch/include/pybind11/ — no separate pybind11 import needed. - include_flags: list[str] = [] - for path in torch_include_paths(): - include_flags.append(f"-I{path}") + include_flags: list[str] = [f"-I{path}" for path in torch_include_paths()] # Python C API headers (e.g. Python.h) required by pybind11. include_flags.append(f"-I{sysconfig.get_path('include')}") - cpp_dir = repo_root / "gigl" / "csrc" - cpp_sources = sorted(cpp_dir.rglob("*.cpp")) if cpp_dir.exists() else [] + cpp_sources = sorted(CSRC_DIR.rglob("*.cpp")) if CSRC_DIR.exists() else [] if not cpp_sources: - print("Warning: no .cpp files found under gigl/csrc/", file=sys.stderr) + print(f"Warning: no .cpp files found under {CSRC_DIR}", file=sys.stderr) + + cxx_flags = " ".join(COMPILE_ARGS) # Each entry in compile_commands.json describes how one source file is compiled. # clang-tidy reads this to reproduce the exact compilation environment. 
commands: list[dict[str, str]] = [ { - "directory": str(repo_root), + "directory": str(REPO_ROOT), "file": str(source), - "command": ( - f"c++ -std=c++17 -Wall -Wextra " - f"{' '.join(include_flags)} " - f"-c {source}" - ), + "command": f"c++ {cxx_flags} {' '.join(include_flags)} -c {source}", } for source in cpp_sources ] - output = repo_root / "build" / "compile_commands.json" + output = REPO_ROOT / "build" / "compile_commands.json" output.parent.mkdir(exist_ok=True) output.write_text(json.dumps(commands, indent=2)) print( From fe2cc0b672df8a25e531a90f2c95e023df5485d0 Mon Sep 17 00:00:00 2001 From: mkolodner Date: Tue, 14 Apr 2026 00:05:43 +0000 Subject: [PATCH 14/14] Update type check --- gigl/csrc/sampling/ppr_forward_push.cpp | 284 +++++++++--------- gigl/csrc/sampling/ppr_forward_push.h | 87 +++--- .../csrc/sampling/python_ppr_forward_push.cpp | 34 +-- 3 files changed, 210 insertions(+), 195 deletions(-) diff --git a/gigl/csrc/sampling/ppr_forward_push.cpp b/gigl/csrc/sampling/ppr_forward_push.cpp index 436967cd7..f97b2f40c 100644 --- a/gigl/csrc/sampling/ppr_forward_push.cpp +++ b/gigl/csrc/sampling/ppr_forward_push.cpp @@ -1,209 +1,218 @@ #include "ppr_forward_push.h" -PPRForwardPushState::PPRForwardPushState(torch::Tensor seed_nodes, - int32_t seed_node_type_id, +PPRForwardPushState::PPRForwardPushState(const torch::Tensor& seedNodes, + int32_t seedNodeTypeId, double alpha, - double requeue_threshold_factor, - std::vector> node_type_to_edge_type_ids, - std::vector edge_type_to_dst_ntype_id, - std::vector degree_tensors) - : alpha_(alpha), - one_minus_alpha_(1.0 - alpha), - requeue_threshold_factor_(requeue_threshold_factor), + double requeueThresholdFactor, + std::vector> nodeTypeToEdgeTypeIds, + std::vector edgeTypeToDstNtypeId, + std::vector degreeTensors) + : _alpha(alpha), + _oneMinusAlpha(1.0 - alpha), + _requeueThresholdFactor(requeueThresholdFactor), // std::move transfers ownership of each vector into the member variable // without copying its 
contents — equivalent to Python's list hand-off // when you no longer need the original. - node_type_to_edge_type_ids_(std::move(node_type_to_edge_type_ids)), - edge_type_to_dst_ntype_id_(std::move(edge_type_to_dst_ntype_id)), - degree_tensors_(std::move(degree_tensors)) { - TORCH_CHECK(seed_nodes.dim() == 1, "seed_nodes must be 1D"); - batch_size_ = static_cast(seed_nodes.size(0)); - num_node_types_ = static_cast(node_type_to_edge_type_ids_.size()); + _nodeTypeToEdgeTypeIds(std::move(nodeTypeToEdgeTypeIds)), + _edgeTypeToDstNtypeId(std::move(edgeTypeToDstNtypeId)), + _degreeTensors(std::move(degreeTensors)) { + TORCH_CHECK(seedNodes.dim() == 1, "seedNodes must be 1D"); + _batchSize = static_cast(seedNodes.size(0)); + _numNodeTypes = static_cast(_nodeTypeToEdgeTypeIds.size()); // Allocate per-seed, per-node-type tables. // .assign(n, val) fills a vector with n copies of val — like [val] * n in Python. - ppr_scores_.assign(batch_size_, std::vector>(num_node_types_)); - residuals_.assign(batch_size_, std::vector>(num_node_types_)); - queue_.assign(batch_size_, std::vector>(num_node_types_)); - queued_nodes_.assign(batch_size_, std::vector>(num_node_types_)); + _pprScores.assign(_batchSize, std::vector>(_numNodeTypes)); + _residuals.assign(_batchSize, std::vector>(_numNodeTypes)); + _queue.assign(_batchSize, std::vector>(_numNodeTypes)); + _queuedNodes.assign(_batchSize, std::vector>(_numNodeTypes)); // accessor() returns a typed view into the tensor's data that // supports [i] indexing with bounds checking in debug builds. - auto acc = seed_nodes.accessor(); - num_nodes_in_queue_ = batch_size_; - for (int32_t i = 0; i < batch_size_; ++i) { - int32_t seed = static_cast(acc[i]); + auto acc = seedNodes.accessor(); + _numNodesInQueue = _batchSize; + for (int32_t i = 0; i < _batchSize; ++i) { + auto seed = static_cast(acc[i]); // PPR initialisation: each seed starts with residual = alpha (the // restart probability). 
The first push will move alpha into ppr_score // and distribute (1-alpha)*alpha to the seed's neighbors. - residuals_[i][seed_node_type_id][seed] = alpha_; - queue_[i][seed_node_type_id].insert(seed); + _residuals[i][seedNodeTypeId][seed] = _alpha; + _queue[i][seedNodeTypeId].insert(seed); } } -std::optional> PPRForwardPushState::drain_queue() { - if (num_nodes_in_queue_ == 0) { +std::optional> PPRForwardPushState::drainQueue() { + if (_numNodesInQueue == 0) { return std::nullopt; } // Reset the snapshot from the previous iteration. - for (int32_t s = 0; s < batch_size_; ++s) - for (auto& qs : queued_nodes_[s]) + for (int32_t s = 0; s < _batchSize; ++s) { + for (auto& qs : _queuedNodes[s]) { qs.clear(); + } + } - // nodes_to_lookup[eid] = set of node IDs that need a neighbor fetch for + // nodesToLookup[eid] = set of node IDs that need a neighbor fetch for // edge type eid this round. Using a set deduplicates nodes that appear // in multiple seeds' queues: we only fetch each (node, etype) pair once. - std::unordered_map> nodes_to_lookup; + std::unordered_map> nodesToLookup; - int32_t total_drained_this_round = 0; - for (int32_t s = 0; s < batch_size_; ++s) { - for (int32_t nt = 0; nt < num_node_types_; ++nt) { - if (queue_[s][nt].empty()) + int32_t totalDrainedThisRound = 0; + for (int32_t s = 0; s < _batchSize; ++s) { + for (int32_t nt = 0; nt < _numNodeTypes; ++nt) { + if (_queue[s][nt].empty()) { continue; + } // Move the live queue into the snapshot (no data copy — O(1)). 
- queued_nodes_[s][nt] = std::move(queue_[s][nt]); - queue_[s][nt].clear(); - total_drained_this_round += static_cast(queued_nodes_[s][nt].size()); - num_nodes_in_queue_ -= static_cast(queued_nodes_[s][nt].size()); - - for (int32_t node_id : queued_nodes_[s][nt]) { - for (int32_t eid : node_type_to_edge_type_ids_[nt]) { - if (neighbor_cache_.find(pack_key(node_id, eid)) == neighbor_cache_.end()) { - nodes_to_lookup[eid].insert(node_id); + _queuedNodes[s][nt] = std::move(_queue[s][nt]); + _queue[s][nt].clear(); + totalDrainedThisRound += static_cast(_queuedNodes[s][nt].size()); + _numNodesInQueue -= static_cast(_queuedNodes[s][nt].size()); + + for (int32_t nodeId : _queuedNodes[s][nt]) { + for (int32_t eid : _nodeTypeToEdgeTypeIds[nt]) { + if (_neighborCache.find(packKey(nodeId, eid)) == _neighborCache.end()) { + nodesToLookup[eid].insert(nodeId); } } } } } - nodes_drained_per_iteration_.push_back(total_drained_this_round); + _nodesDrainedPerIteration.push_back(totalDrainedThisRound); std::unordered_map result; - for (auto& [eid, node_set] : nodes_to_lookup) { - std::vector ids(node_set.begin(), node_set.end()); + for (auto& [eid, nodeSet] : nodesToLookup) { + std::vector ids(nodeSet.begin(), nodeSet.end()); result[eid] = torch::tensor(ids, torch::kLong); } return result; } -const std::vector& PPRForwardPushState::get_nodes_drained_per_iteration() const { - return nodes_drained_per_iteration_; +const std::vector& PPRForwardPushState::getNodesDrainedPerIteration() const { + return _nodesDrainedPerIteration; } -void PPRForwardPushState::push_residuals( - const std::unordered_map>& fetched_by_etype_id) { - // Step 1: Unpack the input map into a C++ map keyed by pack_key(node_id, etype_id) +void PPRForwardPushState::pushResiduals( + const std::unordered_map>& fetchedByEtypeId) { + // Step 1: Unpack the input map into a C++ map keyed by packKey(nodeId, etypeId) // for fast lookup during the residual-push loop below. 
std::unordered_map> fetched; - for (const auto& [eid, tup] : fetched_by_etype_id) { - const auto& node_ids_t = std::get<0>(tup); - const auto& flat_nbrs_t = std::get<1>(tup); - const auto& counts_t = std::get<2>(tup); + for (const auto& [eid, tup] : fetchedByEtypeId) { + const auto& nodeIdsT = std::get<0>(tup); + const auto& flatNbrsT = std::get<1>(tup); + const auto& countsT = std::get<2>(tup); // accessor() gives a bounds-checked, typed 1-D view into // each tensor's data — equivalent to iterating over a NumPy array. - auto node_acc = node_ids_t.accessor(); - auto nbr_acc = flat_nbrs_t.accessor(); - auto cnt_acc = counts_t.accessor(); + auto nodeAcc = nodeIdsT.accessor(); + auto nbrAcc = flatNbrsT.accessor(); + auto cntAcc = countsT.accessor(); // Walk the flat neighbor list, slicing out each node's neighbors using // the running offset into the concatenated flat buffer. int64_t offset = 0; - for (int64_t i = 0; i < node_ids_t.size(0); ++i) { - int32_t nid = static_cast(node_acc[i]); - int64_t count = cnt_acc[i]; + for (int64_t i = 0; i < nodeIdsT.size(0); ++i) { + auto nid = static_cast(nodeAcc[i]); + int64_t count = cntAcc[i]; std::vector nbrs(count); - for (int64_t j = 0; j < count; ++j) - nbrs[j] = static_cast(nbr_acc[offset + j]); - fetched[pack_key(nid, eid)] = std::move(nbrs); + for (int64_t j = 0; j < count; ++j) { + nbrs[j] = static_cast(nbrAcc[offset + j]); + } + fetched[packKey(nid, eid)] = std::move(nbrs); offset += count; } } - // Step 2: For every node that was in the queue (captured in queued_nodes_ - // by drain_queue()), apply one PPR push step: + // Step 2: For every node that was in the queue (captured in _queuedNodes + // by drainQueue()), apply one PPR push step: // a. Absorb residual into the PPR score. // b. Distribute (1-alpha) * residual equally to each neighbor. // c. Enqueue any neighbor whose residual now exceeds the requeue threshold. 
- for (int32_t s = 0; s < batch_size_; ++s) { - for (int32_t nt = 0; nt < num_node_types_; ++nt) { - if (queued_nodes_[s][nt].empty()) + for (int32_t s = 0; s < _batchSize; ++s) { + for (int32_t nt = 0; nt < _numNodeTypes; ++nt) { + if (_queuedNodes[s][nt].empty()) { continue; + } - for (int32_t src : queued_nodes_[s][nt]) { - auto& src_res = residuals_[s][nt]; - auto it = src_res.find(src); - double res = (it != src_res.end()) ? it->second : 0.0; + for (int32_t src : _queuedNodes[s][nt]) { + auto& srcRes = _residuals[s][nt]; + auto it = srcRes.find(src); + double res = (it != srcRes.end()) ? it->second : 0.0; // a. Absorb: move residual into the PPR score. - ppr_scores_[s][nt][src] += res; - src_res[src] = 0.0; + _pprScores[s][nt][src] += res; + srcRes[src] = 0.0; // b. Count total fetched/cached neighbors across all edge types for // this source node. We normalise by the number of neighbors we // actually retrieved, not the true degree, so residual is fully // distributed among known neighbors rather than leaking to unfetched // ones (which matters when num_neighbors_per_hop < true_degree). - int32_t total_fetched = 0; - for (int32_t eid : node_type_to_edge_type_ids_[nt]) { - auto fi = fetched.find(pack_key(src, eid)); + int32_t totalFetched = 0; + for (int32_t eid : _nodeTypeToEdgeTypeIds[nt]) { + auto fi = fetched.find(packKey(src, eid)); if (fi != fetched.end()) { - total_fetched += static_cast(fi->second.size()); + totalFetched += static_cast(fi->second.size()); } else { - auto ci = neighbor_cache_.find(pack_key(src, eid)); - if (ci != neighbor_cache_.end()) - total_fetched += static_cast(ci->second.size()); + auto ci = _neighborCache.find(packKey(src, eid)); + if (ci != _neighborCache.end()) { + totalFetched += static_cast(ci->second.size()); + } } } // Destination-only nodes (or nodes with no fetched neighbors) absorb // residual but do not push further. 
- if (total_fetched == 0) + if (totalFetched == 0) { continue; + } - double res_per_nbr = one_minus_alpha_ * res / static_cast(total_fetched); + double resPerNbr = _oneMinusAlpha * res / static_cast(totalFetched); - for (int32_t eid : node_type_to_edge_type_ids_[nt]) { - // Invariant: fetched and neighbor_cache_ are mutually exclusive for - // any given (node, etype) key within one iteration. drain_queue() - // only requests a fetch for nodes absent from neighbor_cache_, so a + for (int32_t eid : _nodeTypeToEdgeTypeIds[nt]) { + // Invariant: fetched and _neighborCache are mutually exclusive for + // any given (node, etype) key within one iteration. drainQueue() + // only requests a fetch for nodes absent from _neighborCache, so a // key is in at most one of the two. - const std::vector* nbr_list = nullptr; - auto fi = fetched.find(pack_key(src, eid)); + const std::vector* nbrList = nullptr; + auto fi = fetched.find(packKey(src, eid)); if (fi != fetched.end()) { - nbr_list = &fi->second; + nbrList = &fi->second; } else { - auto ci = neighbor_cache_.find(pack_key(src, eid)); - if (ci != neighbor_cache_.end()) - nbr_list = &ci->second; + auto ci = _neighborCache.find(packKey(src, eid)); + if (ci != _neighborCache.end()) { + nbrList = &ci->second; + } } - if (!nbr_list || nbr_list->empty()) + if (!nbrList || nbrList->empty()) { continue; + } - int32_t dst_nt = edge_type_to_dst_ntype_id_[eid]; + int32_t dstNt = _edgeTypeToDstNtypeId[eid]; // c. Accumulate residual for each neighbor and re-enqueue if threshold // exceeded. 
- for (int32_t nbr : *nbr_list) { - residuals_[s][dst_nt][nbr] += res_per_nbr; + for (int32_t nbr : *nbrList) { + _residuals[s][dstNt][nbr] += resPerNbr; - double threshold = - requeue_threshold_factor_ * static_cast(get_total_degree(nbr, dst_nt)); + double threshold = _requeueThresholdFactor * static_cast(getTotalDegree(nbr, dstNt)); - if (queue_[s][dst_nt].find(nbr) == queue_[s][dst_nt].end() && - residuals_[s][dst_nt][nbr] >= threshold) { - queue_[s][dst_nt].insert(nbr); - ++num_nodes_in_queue_; + if (_queue[s][dstNt].find(nbr) == _queue[s][dstNt].end() && + _residuals[s][dstNt][nbr] >= threshold) { + _queue[s][dstNt].insert(nbr); + ++_numNodesInQueue; // Promote neighbor lists to the persistent cache: this node will // be processed next iteration, so caching avoids a re-fetch. - for (int32_t peid : node_type_to_edge_type_ids_[dst_nt]) { - uint64_t pk = pack_key(nbr, peid); - if (neighbor_cache_.find(pk) == neighbor_cache_.end()) { + for (int32_t peid : _nodeTypeToEdgeTypeIds[dstNt]) { + uint64_t pk = packKey(nbr, peid); + if (_neighborCache.find(pk) == _neighborCache.end()) { auto pfi = fetched.find(pk); - if (pfi != fetched.end()) - neighbor_cache_[pk] = pfi->second; + if (pfi != fetched.end()) { + _neighborCache[pk] = pfi->second; + } } } } @@ -214,23 +223,26 @@ void PPRForwardPushState::push_residuals( } } -std::unordered_map> PPRForwardPushState::extract_top_k( - int32_t max_ppr_nodes) { +std::unordered_map> PPRForwardPushState::extractTopK( + int32_t maxPprNodes) { std::unordered_set active; - for (int32_t s = 0; s < batch_size_; ++s) - for (int32_t nt = 0; nt < num_node_types_; ++nt) - if (!ppr_scores_[s][nt].empty()) + for (int32_t s = 0; s < _batchSize; ++s) { + for (int32_t nt = 0; nt < _numNodeTypes; ++nt) { + if (!_pprScores[s][nt].empty()) { active.insert(nt); + } + } + } std::unordered_map> result; for (int32_t nt : active) { - std::vector flat_ids; - std::vector flat_weights; - std::vector valid_counts; + std::vector flatIds; + std::vector 
flatWeights; + std::vector validCounts; - for (int32_t s = 0; s < batch_size_; ++s) { - const auto& scores = ppr_scores_[s][nt]; - int32_t k = std::min(max_ppr_nodes, static_cast(scores.size())); + for (int32_t s = 0; s < _batchSize; ++s) { + const auto& scores = _pprScores[s][nt]; + int32_t k = std::min(maxPprNodes, static_cast(scores.size())); if (k > 0) { std::vector> items(scores.begin(), scores.end()); std::partial_sort(items.begin(), items.begin() + k, items.end(), [](const auto& a, const auto& b) { @@ -238,36 +250,38 @@ std::unordered_map(items[i].first)); + flatIds.push_back(static_cast(items[i].first)); // Cast to float32 for output; internal scores stay double to // avoid accumulated rounding errors in the push loop. - flat_weights.push_back(static_cast(items[i].second)); + flatWeights.push_back(static_cast(items[i].second)); } } - valid_counts.push_back(static_cast(k)); + validCounts.push_back(static_cast(k)); } - result[nt] = {torch::tensor(flat_ids, torch::kLong), - torch::tensor(flat_weights, torch::kFloat), - torch::tensor(valid_counts, torch::kLong)}; + result[nt] = {torch::tensor(flatIds, torch::kLong), + torch::tensor(flatWeights, torch::kFloat), + torch::tensor(validCounts, torch::kLong)}; } return result; } -int32_t PPRForwardPushState::get_total_degree(int32_t node_id, int32_t ntype_id) const { - if (ntype_id >= static_cast(degree_tensors_.size())) +int32_t PPRForwardPushState::getTotalDegree(int32_t nodeId, int32_t ntypeId) const { + if (ntypeId >= static_cast(_degreeTensors.size())) { return 0; - const auto& t = degree_tensors_[ntype_id]; - if (t.numel() == 0) + } + const auto& t = _degreeTensors[ntypeId]; + if (t.numel() == 0) { return 0; - TORCH_CHECK(node_id < static_cast(t.size(0)), + } + TORCH_CHECK(nodeId < static_cast(t.size(0)), "Node ID ", - node_id, + nodeId, " out of range for degree tensor of ntype_id ", - ntype_id, + ntypeId, " (size=", t.size(0), "). 
This indicates corrupted graph data or a sampler bug."); // data_ptr() returns a raw C pointer to the tensor's int32 data buffer. - return t.data_ptr()[node_id]; + return t.data_ptr()[nodeId]; } diff --git a/gigl/csrc/sampling/ppr_forward_push.h b/gigl/csrc/sampling/ppr_forward_push.h index 82973ff7a..7a6fac69d 100644 --- a/gigl/csrc/sampling/ppr_forward_push.h +++ b/gigl/csrc/sampling/ppr_forward_push.h @@ -22,9 +22,9 @@ // negative int32_t (e.g. -1 = 0xFFFFFFFF) would be sign-extended to a full // 64-bit value, corrupting the upper bits when shifted. Reinterpreting as // uint32_t first treats the bit pattern as-is (no sign extension). -static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { - return (static_cast(static_cast(node_id)) << 32) | - static_cast(etype_id); +static inline uint64_t packKey(int32_t nodeId, int32_t etypeId) { + return (static_cast(static_cast(nodeId)) << 32) | + static_cast(etypeId); } // C++ kernel for the PPR Forward Push algorithm (Andersen et al., 2006). @@ -33,43 +33,45 @@ static inline uint64_t pack_key(int32_t node_id, int32_t etype_id) { // this object. The distributed neighbor fetch is kept in Python because it // involves async RPC calls that C++ cannot drive directly. // -// Owned state: ppr_scores, residuals, queue, queued_nodes, neighbor_cache. +// Owned state: _pprScores, _residuals, _queue, _queuedNodes, _neighborCache. // Python retains ownership of: the distributed neighbor fetch (_batch_fetch_neighbors). // // Typical call sequence per batch: -// 1. PPRForwardPushState(seed_nodes, ...) — init per-seed residuals / queue +// 1. PPRForwardPushState(seedNodes, ...) — init per-seed residuals / queue // while True: -// 2. drain_queue() — drain queue → nodes needing lookup +// 2. drainQueue() — drain queue → nodes needing lookup // 3. — distributed RPC fetch (stays in Python) -// 4. push_residuals(fetched_by_etype_id) — push residuals, update queue -// 5. 
extract_top_k(max_ppr_nodes) — top-k selection per seed per node type +// 4. pushResiduals(fetchedByEtypeId) — push residuals, update queue +// 5. extractTopK(maxPprNodes) — top-k selection per seed per node type class PPRForwardPushState { public: - PPRForwardPushState(torch::Tensor seed_nodes, int32_t seed_node_type_id, double alpha, - double requeue_threshold_factor, - std::vector> node_type_to_edge_type_ids, - std::vector edge_type_to_dst_ntype_id, - std::vector degree_tensors); + PPRForwardPushState(const torch::Tensor& seedNodes, + int32_t seedNodeTypeId, + double alpha, + double requeueThresholdFactor, + std::vector> nodeTypeToEdgeTypeIds, + std::vector edgeTypeToDstNtypeId, + std::vector degreeTensors); // Drain all queued nodes and return {etype_id: tensor[node_ids]} for batch - // neighbor lookup. Also snapshots the drained nodes into queued_nodes_ for - // use by push_residuals(). + // neighbor lookup. Also snapshots the drained nodes into _queuedNodes for + // use by pushResiduals(). // // Return value semantics: // - std::nullopt → queue was already empty; convergence achieved; stop the loop. - // - empty map → nodes were drained but all were cached; call push_residuals({}). + // - empty map → nodes were drained but all were cached; call pushResiduals({}). // - non-empty map → {etype_id → 1-D int64 tensor of node IDs} needing neighbor lookup. - std::optional> drain_queue(); + std::optional> drainQueue(); // Push residuals to neighbors given the fetched neighbor data. 
// - // fetched_by_etype_id: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)} + // fetchedByEtypeId: {etype_id: (node_ids_tensor, flat_nbrs_tensor, counts_tensor)} // - node_ids_tensor: [N] int64 — source node IDs fetched for this edge type // - flat_nbrs_tensor: [sum(counts)] int64 — all neighbor lists concatenated flat // - counts_tensor: [N] int64 — neighbor count for each source node - void push_residuals(const std::unordered_map< - int32_t, std::tuple>& - fetched_by_etype_id); + void pushResiduals(const std::unordered_map< + int32_t, std::tuple>& + fetchedByEtypeId); // Extract top-k PPR nodes per seed per node type. // @@ -81,52 +83,51 @@ class PPRForwardPushState { // flat_ids[valid_counts[0] : valid_counts[0]+valid_counts[1]] → top-k for seed 1 // ... std::unordered_map> - extract_top_k(int32_t max_ppr_nodes); + extractTopK(int32_t maxPprNodes); + + // Returns _nodesDrainedPerIteration built up across all drainQueue() calls. + [[nodiscard]] const std::vector& getNodesDrainedPerIteration() const; private: // Look up the total (across all edge types) out-degree of a node. // Returns 0 for destination-only node types (no outgoing edges). 
- int32_t get_total_degree(int32_t node_id, int32_t ntype_id) const; + [[nodiscard]] int32_t getTotalDegree(int32_t nodeId, int32_t ntypeId) const; // ------------------------------------------------------------------------- // Scalar algorithm parameters // ------------------------------------------------------------------------- - double alpha_; // Restart probability - double one_minus_alpha_; // 1 - alpha, precomputed to avoid repeated subtraction - double requeue_threshold_factor_; // alpha * eps; multiplied by degree to get per-node threshold + double _alpha; // Restart probability + double _oneMinusAlpha; // 1 - alpha, precomputed to avoid repeated subtraction + double _requeueThresholdFactor; // alpha * eps; multiplied by degree to get per-node threshold - int32_t batch_size_; // Number of seeds in the current batch - int32_t num_node_types_; // Total number of node types (homo + hetero) - int32_t num_nodes_in_queue_{0}; // Running count of nodes across all seeds / types + int32_t _batchSize; // Number of seeds in the current batch + int32_t _numNodeTypes; // Total number of node types (homo + hetero) + int32_t _numNodesInQueue{0}; // Running count of nodes across all seeds / types // ------------------------------------------------------------------------- // Graph structure (read-only after construction) // ------------------------------------------------------------------------- - std::vector> node_type_to_edge_type_ids_; - std::vector edge_type_to_dst_ntype_id_; - std::vector degree_tensors_; + std::vector> _nodeTypeToEdgeTypeIds; + std::vector _edgeTypeToDstNtypeId; + std::vector _degreeTensors; // ------------------------------------------------------------------------- // Per-seed, per-node-type PPR state (indexed [seed_idx][ntype_id]) // ------------------------------------------------------------------------- - std::vector>> ppr_scores_; - std::vector>> residuals_; - std::vector>> queue_; - std::vector>> queued_nodes_; + std::vector>> _pprScores; + 
std::vector>> _residuals; + std::vector>> _queue; + std::vector>> _queuedNodes; // ------------------------------------------------------------------------- // Neighbor cache // ------------------------------------------------------------------------- - std::unordered_map> neighbor_cache_; + std::unordered_map> _neighborCache; // ------------------------------------------------------------------------- // Diagnostics (populated during the algorithm; read after convergence) // ------------------------------------------------------------------------- - // Total nodes drained (across all seeds and node types) in each drain_queue() + // Total nodes drained (across all seeds and node types) in each drainQueue() // call. One entry per loop iteration; useful for understanding convergence speed. - std::vector nodes_drained_per_iteration_; - - public: - // Returns nodes_drained_per_iteration_ built up across all drain_queue() calls. - const std::vector& get_nodes_drained_per_iteration() const; + std::vector _nodesDrainedPerIteration; }; diff --git a/gigl/csrc/sampling/python_ppr_forward_push.cpp b/gigl/csrc/sampling/python_ppr_forward_push.cpp index aafb32cdc..98ff40179 100644 --- a/gigl/csrc/sampling/python_ppr_forward_push.cpp +++ b/gigl/csrc/sampling/python_ppr_forward_push.cpp @@ -11,10 +11,10 @@ namespace py = pybind11; -// drain_queue: C++ returns std::optional>. +// drainQueue: C++ returns std::optional>. // Exposed to Python as: None (convergence) or dict[int, Tensor]. -static py::object drain_queue_wrapper(PPRForwardPushState& self) { - auto result = self.drain_queue(); +static py::object drainQueueWrapper(PPRForwardPushState& self) { + auto result = self.drainQueue(); if (!result) { return py::none(); } @@ -25,29 +25,29 @@ static py::object drain_queue_wrapper(PPRForwardPushState& self) { return d; } -// push_residuals: Python passes dict[int, tuple[Tensor, Tensor, Tensor]]. +// pushResiduals: Python passes dict[int, tuple[Tensor, Tensor, Tensor]]. 
// Convert to C++ map before delegating. -static void push_residuals_wrapper(PPRForwardPushState& self, py::dict fetched_by_etype_id) { - std::unordered_map> cpp_map; +static void pushResidualsWrapper(PPRForwardPushState& self, const py::dict& fetchedByEtypeId) { + std::unordered_map> cppMap; // Dict iteration touches Python objects — GIL must be held here. - for (auto item : fetched_by_etype_id) { - int32_t eid = item.first.cast(); + for (auto item : fetchedByEtypeId) { + auto eid = item.first.cast(); auto tup = item.second.cast(); - cpp_map[eid] = {tup[0].cast(), tup[1].cast(), tup[2].cast()}; + cppMap[eid] = {tup[0].cast(), tup[1].cast(), tup[2].cast()}; } // C++ push only uses tensor accessor/data_ptr APIs — GIL-safe to release. // Releasing here lets the asyncio event loop process RPC completion callbacks // from other concurrent PPR coroutines while this push runs. { py::gil_scoped_release release; - self.push_residuals(cpp_map); + self.pushResiduals(cppMap); } } -// extract_top_k: C++ returns map>. +// extractTopK: C++ returns map>. // Exposed to Python as dict[int, tuple[Tensor, Tensor, Tensor]]. 
-static py::dict extract_top_k_wrapper(PPRForwardPushState& self, int32_t max_ppr_nodes) { - auto result = self.extract_top_k(max_ppr_nodes); +static py::dict extractTopKWrapper(PPRForwardPushState& self, int32_t maxPprNodes) { + auto result = self.extractTopK(maxPprNodes); py::dict d; for (auto& [nt, tup] : result) { d[py::int_(nt)] = py::make_tuple(std::get<0>(tup), std::get<1>(tup), std::get<2>(tup)); @@ -66,8 +66,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { std::vector>, std::vector, std::vector>()) - .def("drain_queue", drain_queue_wrapper) - .def("push_residuals", push_residuals_wrapper) - .def("extract_top_k", extract_top_k_wrapper) - .def("get_nodes_drained_per_iteration", &PPRForwardPushState::get_nodes_drained_per_iteration); + .def("drain_queue", drainQueueWrapper) + .def("push_residuals", pushResidualsWrapper) + .def("extract_top_k", extractTopKWrapper) + .def("get_nodes_drained_per_iteration", &PPRForwardPushState::getNodesDrainedPerIteration); }