
Commit 4dc75e2

Split large chunks (#46)
Chunks over 10k chars (configurable) are now split up so each piece stays under 10k chars. Previously these would just remain as single large chunks, and the `default_response_max_chars` option (default 20k chars) meant the LLM couldn't see them at all; in my experience the LLM would generally just guess things as a result. So basically large functions and large whole files are now handled happily.
1 parent 1dd6bab commit 4dc75e2
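
For a feel of the strategy, here is a minimal standalone sketch of the line-boundary splitting this commit implements. The names are hypothetical, not mcpunk's API; the real implementation is `Chunk.split` in the mcpunk/file_chunk.py diff below.

```python
def split_text(text: str, max_size: int = 10_000) -> list[str]:
    """Split text into parts of at most max_size chars, preferring line boundaries."""
    # Hard-split any single line that is itself longer than max_size.
    lines: list[str] = []
    for line in text.splitlines(keepends=True):
        for i in range(0, len(line), max_size):
            lines.append(line[i : i + max_size])

    # Greedily pack whole lines into parts without exceeding max_size.
    parts: list[str] = []
    current: list[str] = []
    current_size = 0
    for line in lines:
        if current and current_size + len(line) > max_size:
            parts.append("".join(current))
            current, current_size = [], 0
        current.append(line)
        current_size += len(line)
    if current:
        parts.append("".join(current))
    return parts


# A 25k-char blob becomes three parts, each small enough to show the LLM.
assert [len(p) for p in split_text("x" * 25_000)] == [10_000, 10_000, 5_000]
```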

File tree

5 files changed (+440, -6 lines)


mcpunk/file_breakdown.py

Lines changed: 22 additions & 4 deletions

```diff
@@ -7,6 +7,7 @@
 from threading import Lock, Timer
 from typing import Literal
 
+import more_itertools
 from git import Repo
 from pydantic import (
     BaseModel,
@@ -171,6 +172,7 @@ def from_file_contents(
         cls,
         source_code: str,
         file_path: Path,
+        max_chunk_size: int = 10_000,
     ) -> "File":
         """Extract all callables, calls and imports from the given source code file."""
         chunks: list[Chunk] = []
@@ -180,6 +182,9 @@ def from_file_contents(
             if chunker.can_chunk(source_code, file_path):
                 try:
                     chunks = chunker(source_code, file_path).chunk_file()
+                    chunks = list(
+                        more_itertools.flatten(x.split(max_size=max_chunk_size) for x in chunks),
+                    )
                     break
                 except Exception:
                     logger.exception(f"Error chunking file {file_path} with {chunker}")
@@ -201,9 +206,11 @@ def __init__(
         root: Path,
         files_per_parallel_worker: int = 100,
         file_watch_refresh_freq_seconds: float = 0.1,
+        max_chunk_size: int = 10_000,
     ) -> None:
         self.root = root.expanduser().absolute()
         self.files_per_parallel_worker = files_per_parallel_worker
+        self.max_chunk_size = max_chunk_size
         self.file_map: dict[Path, File] = {}
 
         git_repo: Repo | None
@@ -241,14 +248,21 @@ def load_files(self, files: list[Path]) -> None:
 
         files_analysed: list[File]
         if n_workers == 1:
-            files_analysed_maybe_none = [_analyze_file(file_path) for file_path in files]
+            files_analysed_maybe_none = [
+                _analyze_file(file_path, max_chunk_size=self.max_chunk_size) for file_path in files
+            ]
             files_analysed = [x for x in files_analysed_maybe_none if x is not None]
         else:
             logger.info(f"Using {n_workers} workers to process {len(files)} files")
             files_analysed = []
             with ProcessPoolExecutor(max_workers=n_workers) as executor:
                 future_to_file = {
-                    executor.submit(_analyze_file, file_path): file_path for file_path in files
+                    executor.submit(
+                        _analyze_file,
+                        file_path,
+                        max_chunk_size=self.max_chunk_size,
+                    ): file_path
+                    for file_path in files
                 }
 
                 for future in as_completed(future_to_file):
@@ -287,7 +301,7 @@ def _init_from_root_dir(self, root: Path) -> None:
         self.load_files(files)
 
 
-def _analyze_file(file_path: Path) -> File | None:
+def _analyze_file(file_path: Path, max_chunk_size: int = 10_000) -> File | None:
     try:
         if not file_path.exists():
             logger.warning(f"File {file_path} does not exist")
@@ -296,7 +310,11 @@ def _analyze_file(file_path: Path) -> File | None:
             logger.warning(f"File {file_path} is not a file")
             return None
 
-        return File.from_file_contents(file_path.read_text(), file_path)
+        return File.from_file_contents(
+            file_path.read_text(),
+            file_path,
+            max_chunk_size=max_chunk_size,
+        )
     except Exception:
         logger.exception(f"Error processing file {file_path}")
         return None
```
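
Note on the wiring above: `Chunk.split` returns a `list[Chunk]` for every chunk, so the comprehension yields a list of lists; `more_itertools.flatten` (which flattens exactly one level) turns that back into a flat sequence of chunks. A tiny self-contained illustration, with strings standing in for chunks:

```python
import more_itertools

# Each inner list plays the role of one chunk's split() result.
parts_per_chunk = [["a1", "a2"], ["b1"], ["c1", "c2", "c3"]]
flat = list(more_itertools.flatten(parts_per_chunk))
assert flat == ["a1", "a2", "b1", "c1", "c2", "c3"]
```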

mcpunk/file_chunk.py

Lines changed: 78 additions & 0 deletions

```diff
@@ -76,3 +76,81 @@ def matches_filter(
         else:
             assert_never(filter_on)
         return matches_filter(filter_, data)
+
+    def split(
+        self,
+        max_size: int = 10_000,
+        split_chunk_prefix: str = (
+            "[This is a subsection of the chunk. Other parts contain the rest of the chunk]\n\n"
+        ),
+    ) -> list["Chunk"]:
+        """Split this chunk into smaller chunks.
+
+        This will split the chunk at line boundaries, unless the
+        line is already longer than max_size.
+
+        Args:
+            max_size: Maximum size in characters for the chunk contents. At least 100.
+            split_chunk_prefix: Prefix to add to the start of each newly created split
+                chunk. Unused if the chunk is not split. Set to an empty string to
+                suppress the prefix.
+
+        Returns:
+            List containing either the original chunk (if small enough) or multiple
+            smaller chunks.
+        """
+        assert max_size >= 100, "max_size must be at least 100"
+        # If chunk is small enough, return it as is
+        if len(self.content) <= max_size:
+            return [self]
+        max_size -= len(split_chunk_prefix)
+        assert max_size > 0, f"{max_size} too small, maybe decrease prefix length"
+
+        result: list[Chunk] = []
+        max_line_size = max_size - 50  # Leave some margin
+
+        # Preprocess to split long lines first. This could be avoided, but it does
+        # make the whole thing a bit simpler as we always know later on that a single
+        # line will never be longer than max_size.
+        processed_lines = []
+        for line in self.content.splitlines(keepends=True):
+            if len(line) > max_line_size:
+                # Split the line into chunks of max_line_size
+                for i in range(0, len(line), max_line_size):
+                    processed_lines.append(line[i : i + max_line_size])
+            else:
+                processed_lines.append(line)
+
+        # Now split into chunks of max_size
+        current_content: list[str] = []
+        current_size = 0
+        part_num = 1
+
+        for line in processed_lines:
+            # If adding this line would exceed the limit, create a new chunk
+            if current_size + len(line) > max_size and current_content:
+                new_chunk = Chunk(
+                    category=self.category,
+                    name=f"{self.name}_part{part_num}",
+                    content=split_chunk_prefix + "".join(current_content),
+                    line=None,
+                )
+                result.append(new_chunk)
+                part_num += 1
+                current_content = []
+                current_size = 0
+
+            # Add the line to the current chunk
+            current_content.append(line)
+            current_size += len(line)
+
+        # Add the final chunk if there's anything left
+        if current_content:
+            new_chunk = Chunk(
+                category=self.category,
+                name=f"{self.name}_part{part_num}",
+                content=split_chunk_prefix + "".join(current_content),
+                line=None,
+            )
+            result.append(new_chunk)
+
+        return result
```
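
A quick usage sketch of the new method. The `category` value below is a guess for illustration (the diff only shows that `split` copies `self.category` through unchanged), and any `Chunk` fields beyond the four used above are assumed to be optional:

```python
from mcpunk.file_chunk import Chunk

# Hypothetical oversized chunk -- in practice the chunkers produce these.
big = Chunk(
    category="callable",  # assumed category value; split() copies it verbatim
    name="my_func",
    content="\n".join(f"line {i}" for i in range(2_000)),
    line=1,
)

parts = big.split(max_size=10_000)
assert all(len(p.content) <= 10_000 for p in parts)
print([p.name for p in parts])  # e.g. ['my_func_part1', 'my_func_part2']

# Small chunks come back untouched, with no prefix added.
small = Chunk(category="callable", name="tiny", content="x = 1\n", line=1)
assert small.split() == [small]
```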

mcpunk/settings.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -36,7 +36,8 @@ class Settings(BaseSettings):
     include_chars_in_response: bool = True
     # Maximum number of characters in the response. If the response is longer than this
     # then an error will be returned to the caller. This is handy to avoid blowing
-    # your context.
+    # your context. HOWEVER this is largely redundant with the max_chunk_size
+    # option. Likely to be removed in the future.
     default_response_max_chars: int = 20_000
     # Same as `default_response_max_chars` but for the tool that returns a git diff.
     # Generally, git diffs are a bit larger than e.g. a function so nice to have it a
@@ -53,6 +54,11 @@ class Settings(BaseSettings):
     # files during save (though this is not a guarantee).
     file_watch_refresh_freq_seconds: float = 0.1
 
+    # Maximum size of a chunk in characters. If a chunk is larger than this,
+    # it will be split into multiple chunks. A chunk is something like a function,
+    # or maybe a whole file (depends on the chunker).
+    max_chunk_size: int = 10_000
+
     @property
     def task_queue_visibility_timeout(self) -> timedelta:
         return timedelta(seconds=self.task_queue_visibility_timeout_seconds)
```
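
Like the other fields on `Settings`, the new knob can be overridden at construction time. Whether an environment variable also works, and under what name, depends on the env prefix configured for `Settings`, which this diff doesn't show:

```python
from mcpunk.settings import Settings

# Purely illustrative -- mcpunk itself reads its Settings via deps.settings().
settings = Settings(max_chunk_size=5_000)
assert settings.max_chunk_size == 5_000
```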

mcpunk/tools.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -259,6 +259,7 @@ def configure_project(
         chunk_project=FileBreakdownProject(
             root=path,
             file_watch_refresh_freq_seconds=deps.settings().file_watch_refresh_freq_seconds,
+            max_chunk_size=deps.settings().max_chunk_size,
         ),
     )
     PROJECTS[project_name] = project
@@ -372,7 +373,8 @@ def find_matching_chunks_in_file(
     - Finding a chunk where a specific function is defined
       (e.g. find_matching_chunks_in_file(..., ["def my_funk"])
 
-    Returns array of {n: name, t: type, id: identifier, chars: length}
+    Some chunks are split into multiple parts, because they are too large. This
+    will look like 'chunkx_part1', 'chunkx_part2', ...
     """
     proj_file = ProjectFile(project_name=project_name, rel_path=rel_path)
     return _list_chunks_in_file(proj_file, filter_, "name_or_content").render()
```
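
And a sketch of wiring the value through by hand, mirroring the `configure_project` hunk above. The import path for `FileBreakdownProject` is an assumption based on its usage in this file, and the root path is hypothetical:

```python
from pathlib import Path

from mcpunk.tools import FileBreakdownProject

# Hypothetical repo path; construction scans and chunks the files under root.
project = FileBreakdownProject(
    root=Path("~/git/some_repo"),
    max_chunk_size=5_000,
)
```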
