-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchunk_heuristics.py
More file actions
62 lines (46 loc) · 1.77 KB
/
chunk_heuristics.py
File metadata and controls
62 lines (46 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Lightweight, query-time hints from chunk text — no AST / re-index required."""
from __future__ import annotations
import re
from dataclasses import dataclass
@dataclass
class ChunkHints:
    """Cheap, text-only metadata describing a single chunk.

    Attributes:
        primary_type_hint: Name of the first top-level ``class`` /
            ``interface`` / ``enum`` / ``record`` in the chunk, or ``None``
            when no such name was detected.
        import_heavy: ``True`` when most lines of the chunk are ``import``
            statements, i.e. the chunk has low semantic density.
    """

    # Detected type name; stays None when nothing was found.
    primary_type_hint: str | None = None
    # Flags chunks that consist mostly of import lines.
    import_heavy: bool = False
# Matches a Java type declaration: any run of the listed modifiers, then one of
# the keywords ``class`` / ``interface`` / ``enum`` / ``record``, then the
# declared identifier, which is captured as group 1.
# The pattern is unanchored (only a leading word boundary), so ``search`` finds
# the first occurrence anywhere in the text it is given.
_JAVA_TYPE = re.compile(
    r"\b(?:public\s+|private\s+|protected\s+|sealed\s+|final\s+|abstract\s+|static\s+)*"
    r"(?:class|interface|enum|record)\s+([A-Za-z_][A-Za-z0-9_]*)"
)
def analyze_chunk(text: str | None, *, language: str, kind: str) -> ChunkHints:
    """Derive heuristic hints from a chunk's raw text.

    Only Java chunks are analysed; every other input yields default hints.

    Args:
        text: Chunk source text. ``None``, empty, or whitespace-only text
            yields default hints.
        language: Language name, compared case-insensitively with ``"java"``.
            A falsy value is treated as "not Java".
        kind: Chunk kind; the exact value ``"java"`` also selects Java
            analysis.

    Returns:
        A ``ChunkHints`` where ``import_heavy`` is set when the chunk has at
        least 3 lines and at least 55% of them start with ``import ``, and
        ``primary_type_hint`` is the first type name declared within the first
        80 lines, ignoring keyword hits that sit on comment lines.
    """
    if not text or not text.strip():
        return ChunkHints()

    is_java = kind == "java" or (language or "").lower() == "java"
    if not is_java:
        return ChunkHints()

    lines = text.strip().split("\n")
    n = len(lines)

    import_heavy = False
    # Very short chunks are never flagged: the ratio is too noisy below 3 lines.
    if n >= 3:
        import_count = sum(
            1 for line in lines if line.lstrip().startswith("import ")
        )
        import_heavy = import_count / n >= 0.55

    # Only the head of the chunk is scanned for a type declaration.
    head = "\n".join(lines[:80])
    primary: str | None = None
    for match in _JAVA_TYPE.finditer(head):
        # Inspect what precedes the match on its own line. Prose such as
        # " * This class represents ..." or "// the class below" must not be
        # mistaken for a declaration.
        line_start = head.rfind("\n", 0, match.start()) + 1
        before = head[line_start:match.start()].lstrip()
        if "//" in before or before.startswith(("/*", "*")):
            continue
        primary = match.group(1)
        break

    return ChunkHints(
        primary_type_hint=primary,
        import_heavy=import_heavy,
    )
# One token: an uppercase ASCII letter followed by ASCII letters, digits, or
# underscores. This covers PascalCase names and SCREAMING_SNAKE constants alike.
_CODE_IDENTIFIER = re.compile(r"[A-Z][a-zA-Z0-9_]*")


def looks_like_code_identifier(query: str) -> bool:
    """Return True when *query* is a single identifier-like token.

    PascalCase type names and SCREAMING_SNAKE constants are good candidates
    for FTS hybrid search. After stripping surrounding whitespace, the query
    qualifies when it is 2 to 200 characters long, starts with an uppercase
    ASCII letter, and contains only ASCII letters, digits, and underscores.
    Mixed-case tokens with underscores (``Foo_bar``) therefore qualify too.

    Args:
        query: Raw user query.

    Returns:
        ``True`` if the stripped query matches the rule above, else ``False``.
    """
    q = query.strip()
    if not 2 <= len(q) <= 200:
        return False
    # A separate SCREAMING_SNAKE check would be redundant: every such token
    # already matches this pattern.
    return _CODE_IDENTIFIER.fullmatch(q) is not None