From 831dbd4a868ba8bb80f98b1b5f76207d7eb77240 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 3 Jun 2026 12:47:52 +0000 Subject: [PATCH] refactor(reranking): Fix O(N^2) complexity in batch rank resolution Co-authored-by: bashandbone <89049923+bashandbone@users.noreply.github.com> --- .jules/bolt.md | 6 +++++- src/codeweaver/providers/reranking/providers/base.py | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 7edb3f3bf..fa031eebd 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -13,7 +13,7 @@ SPDX-License-Identifier: MIT OR Apache-2.0 # 2026-03-29 - Consider Readability and Possible Environment Limitations **Learning** While some patterns are hypothetically faster, they may not improve performance in i/o bound contexts. Examples include embedding/reranking requests and database operations where the dominant limiting factors are i/o constraints. -**Action** Don't recommend changes that reduce readability or diverge from Python idioms for no or marginal gains in performance. +**Action** Don't recommend changes that reduce readability or diverge from Python idioms for no or marginal gains in performance. ## 2026-04-01 - Fast generation of line pos lengths in Chunker with itertools **Learning:** itertools.accumulate(map(len, lines)) is significantly faster (~2-3x) than using a generator expression like (line_offsets[-1] + len(line) for line in lines) because it pushes the entire loop down to C level instead of creating generator overhead for each element. @@ -25,3 +25,7 @@ SPDX-License-Identifier: MIT OR Apache-2.0 ## 2025-04-12 - Walrus Operator Optimization **Learning:** Using the walrus operator inside a list comprehension to avoid redundant execution of string methods (like `.strip()`) is an effective and safe micro-optimization. 
The result of the assignment inside the list comprehension will intentionally leak into the scope of the caller function, but this standard Python behavior does not cause naming conflicts in non-recursive or non-global scopes. **Action:** Always favor using the walrus operator `:=` in list comprehensions or conditionals when identical string manipulations (e.g., `.strip()`) or expensive evaluation calls appear repeatedly within the identical expression branch. + +## 2026-05-18 - Reranking Processing Loop Algorithmic Complexity +**Learning:** In `src/codeweaver/providers/reranking/providers/base.py`, mapping sequence results using a nested generator expression `next((j + 1 for j, (idx, _) in enumerate(mapped_scores) if idx == i), -1)` creates an O(N^2) complexity bottleneck. This degrades performance severely for larger batches of results. +**Action:** When matching items between two arrays or associating ranks to indices, always pre-compute a dictionary (`{idx: j+1 for j, (idx, _) in enumerate(mapped_scores)}`) and use a standard `ranks.get(i, -1)` lookup. This resolves the bottleneck by ensuring O(1) lookups, reducing the rank-resolution loop from O(N^2) to O(N) (the preceding sort remains O(N log N)). diff --git a/src/codeweaver/providers/reranking/providers/base.py b/src/codeweaver/providers/reranking/providers/base.py index 28e7a93b2..aa05a0fcc 100644 --- a/src/codeweaver/providers/reranking/providers/base.py +++ b/src/codeweaver/providers/reranking/providers/base.py @@ -91,10 +91,12 @@ def default_reranking_output_transformer( mapped_scores = sorted( ((i, score) for i, score in enumerate(results)), key=lambda x: x[1], reverse=True ) + # Optimization: Precompute dictionary outside generator to reduce O(N^2) complexity to O(N) + ranks = {idx: j + 1 for j, (idx, _) in enumerate(mapped_scores)} processed_results.extend( RerankingResult( original_index=i, - batch_rank=next((j + 1 for j, (idx, _) in enumerate(mapped_scores) if idx == i), -1), + batch_rank=ranks.get(i, -1), score=score, chunk=chunk, )