
Commit 4dc75e2

Split large chunks (#46)
Chunks over 10k chars (configurable) are now split up so each piece stays under 10k chars. Previously these would just remain as single large chunks, and the `default_response_max_chars` option (default 20k chars) meant the LLM couldn't see them at all; in my experience the LLM would generally just guess things as a result. So basically large functions and large whole files are now handled happily.
1 parent 1dd6bab commit 4dc75e2
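
For a feel of the strategy, here is a minimal standalone sketch of the line-boundary splitting this commit implements. The names are hypothetical, not mcpunk's API; the real implementation is `Chunk.split` in the mcpunk/file_chunk.py diff below.

```python
def split_text(text: str, max_size: int = 10_000) -> list[str]:
    """Split text into parts of at most max_size chars, preferring line boundaries."""
    # Hard-split any single line that is itself longer than max_size.
    lines: list[str] = []
    for line in text.splitlines(keepends=True):
        for i in range(0, len(line), max_size):
            lines.append(line[i : i + max_size])

    # Greedily pack whole lines into parts without exceeding max_size.
    parts: list[str] = []
    current: list[str] = []
    current_size = 0
    for line in lines:
        if current and current_size + len(line) > max_size:
            parts.append("".join(current))
            current, current_size = [], 0
        current.append(line)
        current_size += len(line)
    if current:
        parts.append("".join(current))
    return parts


# A 25k-char blob becomes three parts, each small enough to show the LLM.
assert [len(p) for p in split_text("x" * 25_000)] == [10_000, 10_000, 5_000]
```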

File tree

5 files changed (+440, -6 lines)


mcpunk/file_breakdown.py

Lines changed: 22 additions & 4 deletions

```diff
@@ -7,6 +7,7 @@
 from threading import Lock, Timer
 from typing import Literal
 
+import more_itertools
 from git import Repo
 from pydantic import (
     BaseModel,
@@ -171,6 +172,7 @@ def from_file_contents(
         cls,
         source_code: str,
         file_path: Path,
+        max_chunk_size: int = 10_000,
     ) -> "File":
         """Extract all callables, calls and imports from the given source code file."""
         chunks: list[Chunk] = []
@@ -180,6 +182,9 @@ def from_file_contents(
             if chunker.can_chunk(source_code, file_path):
                 try:
                     chunks = chunker(source_code, file_path).chunk_file()
+                    chunks = list(
+                        more_itertools.flatten(x.split(max_size=max_chunk_size) for x in chunks),
+                    )
                     break
                 except Exception:
                     logger.exception(f"Error chunking file {file_path} with {chunker}")
@@ -201,9 +206,11 @@ def __init__(
         root: Path,
         files_per_parallel_worker: int = 100,
         file_watch_refresh_freq_seconds: float = 0.1,
+        max_chunk_size: int = 10_000,
     ) -> None:
         self.root = root.expanduser().absolute()
         self.files_per_parallel_worker = files_per_parallel_worker
+        self.max_chunk_size = max_chunk_size
         self.file_map: dict[Path, File] = {}
 
         git_repo: Repo | None
@@ -241,14 +248,21 @@ def load_files(self, files: list[Path]) -> None:
 
         files_analysed: list[File]
         if n_workers == 1:
-            files_analysed_maybe_none = [_analyze_file(file_path) for file_path in files]
+            files_analysed_maybe_none = [
+                _analyze_file(file_path, max_chunk_size=self.max_chunk_size) for file_path in files
+            ]
             files_analysed = [x for x in files_analysed_maybe_none if x is not None]
         else:
             logger.info(f"Using {n_workers} workers to process {len(files)} files")
             files_analysed = []
             with ProcessPoolExecutor(max_workers=n_workers) as executor:
                 future_to_file = {
-                    executor.submit(_analyze_file, file_path): file_path for file_path in files
+                    executor.submit(
+                        _analyze_file,
+                        file_path,
+                        max_chunk_size=self.max_chunk_size,
+                    ): file_path
+                    for file_path in files
                 }
 
                 for future in as_completed(future_to_file):
@@ -287,7 +301,7 @@ def _init_from_root_dir(self, root: Path) -> None:
         self.load_files(files)
 
 
-def _analyze_file(file_path: Path) -> File | None:
+def _analyze_file(file_path: Path, max_chunk_size: int = 10_000) -> File | None:
     try:
         if not file_path.exists():
             logger.warning(f"File {file_path} does not exist")
@@ -296,7 +310,11 @@ def _analyze_file(file_path: Path) -> File | None:
             logger.warning(f"File {file_path} is not a file")
             return None
 
-        return File.from_file_contents(file_path.read_text(), file_path)
+        return File.from_file_contents(
+            file_path.read_text(),
+            file_path,
+            max_chunk_size=max_chunk_size,
+        )
     except Exception:
         logger.exception(f"Error processing file {file_path}")
         return None
```
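
Note on the wiring above: `Chunk.split` returns a `list[Chunk]` for every chunk, so the comprehension yields a list of lists; `more_itertools.flatten` (which flattens exactly one level) turns that back into a flat sequence of chunks. A tiny self-contained illustration, with strings standing in for chunks:

```python
import more_itertools

# Each inner list plays the role of one chunk's split() result.
parts_per_chunk = [["a1", "a2"], ["b1"], ["c1", "c2", "c3"]]
flat = list(more_itertools.flatten(parts_per_chunk))
assert flat == ["a1", "a2", "b1", "c1", "c2", "c3"]
```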

mcpunk/file_chunk.py

Lines changed: 78 additions & 0 deletions

```diff
@@ -76,3 +76,81 @@ def matches_filter(
         else:
             assert_never(filter_on)
         return matches_filter(filter_, data)
+
+    def split(
+        self,
+        max_size: int = 10_000,
+        split_chunk_prefix: str = (
+            "[This is a subsection of the chunk. Other parts contain the rest of the chunk]\n\n"
+        ),
+    ) -> list["Chunk"]:
+        """Split this chunk into smaller chunks.
+
+        This will split the chunk at line boundaries, unless the
+        line is already longer than max_size.
+
+        Args:
+            max_size: Maximum size in characters for the chunk contents. At least 100.
+            split_chunk_prefix: Prefix to add to the start of each newly created split
+                chunk. Unused if the chunk is not split. Set to an empty string to
+                suppress the prefix.
+
+        Returns:
+            List containing either the original chunk (if small enough) or multiple
+            smaller chunks.
+        """
+        assert max_size >= 100, "max_size must be at least 100"
+        # If chunk is small enough, return it as is
+        if len(self.content) <= max_size:
+            return [self]
+        max_size -= len(split_chunk_prefix)
+        assert max_size > 0, f"{max_size} too small, maybe decrease prefix length"
+
+        result: list[Chunk] = []
+        max_line_size = max_size - 50  # Leave some margin
+
+        # Preprocess to split long lines first. This could be avoided, but it does
+        # make the whole thing a bit simpler as we always know later on that a single
+        # line will never be longer than max_size.
+        processed_lines = []
+        for line in self.content.splitlines(keepends=True):
+            if len(line) > max_line_size:
+                # Split the line into chunks of max_line_size
+                for i in range(0, len(line), max_line_size):
+                    processed_lines.append(line[i : i + max_line_size])
+            else:
+                processed_lines.append(line)
+
+        # Now split into chunks of max_size
+        current_content: list[str] = []
+        current_size = 0
+        part_num = 1
+
+        for line in processed_lines:
+            # If adding this line would exceed the limit, create a new chunk
+            if current_size + len(line) > max_size and current_content:
+                new_chunk = Chunk(
+                    category=self.category,
+                    name=f"{self.name}_part{part_num}",
+                    content=split_chunk_prefix + "".join(current_content),
+                    line=None,
+                )
+                result.append(new_chunk)
+                part_num += 1
+                current_content = []
+                current_size = 0
+
+            # Add the line to the current chunk
+            current_content.append(line)
+            current_size += len(line)
+
+        # Add the final chunk if there's anything left
+        if current_content:
+            new_chunk = Chunk(
+                category=self.category,
+                name=f"{self.name}_part{part_num}",
+                content=split_chunk_prefix + "".join(current_content),
+                line=None,
+            )
+            result.append(new_chunk)
+
+        return result
```
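
A quick usage sketch of the new method. The `category` value below is a guess for illustration (the diff only shows that `split` copies `self.category` through unchanged), and any `Chunk` fields beyond the four used above are assumed to be optional:

```python
from mcpunk.file_chunk import Chunk

# Hypothetical oversized chunk -- in practice the chunkers produce these.
big = Chunk(
    category="callable",  # assumed category value; split() copies it verbatim
    name="my_func",
    content="\n".join(f"line {i}" for i in range(2_000)),
    line=1,
)

parts = big.split(max_size=10_000)
assert all(len(p.content) <= 10_000 for p in parts)
print([p.name for p in parts])  # e.g. ['my_func_part1', 'my_func_part2']

# Small chunks come back untouched, with no prefix added.
small = Chunk(category="callable", name="tiny", content="x = 1\n", line=1)
assert small.split() == [small]
```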

mcpunk/settings.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -36,7 +36,8 @@ class Settings(BaseSettings):
     include_chars_in_response: bool = True
     # Maximum number of characters in the response. If the response is longer than this
     # then an error will be returned to the caller. This is handy to avoid blowing
-    # your context.
+    # your context. HOWEVER this is largely redundant with the max_chunk_size
+    # option. Likely to be removed in the future.
     default_response_max_chars: int = 20_000
     # Same as `default_response_max_chars` but for the tool that returns a git diff.
     # Generally, git diffs are a bit larger than e.g. a function so nice to have it a
@@ -53,6 +54,11 @@ class Settings(BaseSettings):
     # files during save (though this is not a guarantee).
     file_watch_refresh_freq_seconds: float = 0.1
 
+    # Maximum size of a chunk in characters. If a chunk is larger than this,
+    # it will be split into multiple chunks. A chunk is something like a function,
+    # or maybe a whole file (depends on the chunker).
+    max_chunk_size: int = 10_000
+
     @property
     def task_queue_visibility_timeout(self) -> timedelta:
         return timedelta(seconds=self.task_queue_visibility_timeout_seconds)
```
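
Like the other fields on `Settings`, the new knob can be overridden at construction time. Whether an environment variable also works, and under what name, depends on the env prefix configured for `Settings`, which this diff doesn't show:

```python
from mcpunk.settings import Settings

# Purely illustrative -- mcpunk itself reads its Settings via deps.settings().
settings = Settings(max_chunk_size=5_000)
assert settings.max_chunk_size == 5_000
```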

mcpunk/tools.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -259,6 +259,7 @@ def configure_project(
         chunk_project=FileBreakdownProject(
             root=path,
             file_watch_refresh_freq_seconds=deps.settings().file_watch_refresh_freq_seconds,
+            max_chunk_size=deps.settings().max_chunk_size,
         ),
     )
     PROJECTS[project_name] = project
@@ -372,7 +373,8 @@ def find_matching_chunks_in_file(
     - Finding a chunk where a specific function is defined
       (e.g. find_matching_chunks_in_file(..., ["def my_funk"])
 
-    Returns array of {n: name, t: type, id: identifier, chars: length}
+    Some chunks are split into multiple parts, because they are too large. This
+    will look like 'chunkx_part1', 'chunkx_part2', ...
     """
     proj_file = ProjectFile(project_name=project_name, rel_path=rel_path)
     return _list_chunks_in_file(proj_file, filter_, "name_or_content").render()
```
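
And a sketch of wiring the value through by hand, mirroring the `configure_project` hunk above. The import path for `FileBreakdownProject` is an assumption based on its usage in this file, and the root path is hypothetical:

```python
from pathlib import Path

from mcpunk.tools import FileBreakdownProject

# Hypothetical repo path; construction scans and chunks the files under root.
project = FileBreakdownProject(
    root=Path("~/git/some_repo"),
    max_chunk_size=5_000,
)
```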
