From 6de8c8132af6538528b144ae7f8ee8c000ac40e6 Mon Sep 17 00:00:00 2001
From: liquidsec <paul.mueller08@gmail.com>
Date: Thu, 18 Jun 2026 10:43:33 -0400
Subject: [PATCH] Reduce memory allocation pressure in HttpCompare, wayback,
 and paramminer

- HttpCompare: replace retained blasthttp Response with lightweight snapshot that spills body to BodySpillStore
- wayback: batch YARA junk-URL matching into single blob match with offset mapping instead of per-URL calls
- paramminer: scope already_checked per-URL so word sets are freed after finish() processes each endpoint
---
 bbot/core/helpers/diff.py          | 61 +++++++++++++++++++++++++++++-
 bbot/modules/paramminer_headers.py | 12 +++---
 bbot/modules/wayback.py            | 55 +++++++++++++++++++++++----
 3 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/bbot/core/helpers/diff.py b/bbot/core/helpers/diff.py
index dc258307d6..58e283af3b 100644
--- a/bbot/core/helpers/diff.py
+++ b/bbot/core/helpers/diff.py
@@ -8,6 +8,59 @@
 log = logging.getLogger("bbot.core.helpers.diff")
 
 
+class _BaselineSnapshot:
+    """Lightweight stand-in for a blasthttp Response held by HttpCompare.
+
+    Stores only the fields external code accesses (status_code, headers,
+    text, content) and optionally spills the body to the scan's
+    BodySpillStore so the raw bytes don't pin Python heap memory for the
+    lifetime of the HttpCompare instance.
+
+    The blasthttp Headers object is kept by reference (it survives
+    Response GC independently) to preserve case-insensitive lookups and
+    duplicate-header semantics that DeepDiff relies on.
+    """
+
+    __slots__ = ("status_code", "headers", "_text", "_spill_key", "_spill_store")
+
+    def __init__(self, response, spill_store=None):
+        self.status_code = response.status_code
+        self.headers = response.headers
+        if spill_store is not None:
+            body_bytes = response.body_bytes or b""
+            self._spill_key = f"baseline-{id(self):x}"
+            spill_store.write(self._spill_key, body_bytes)
+            self._spill_store = spill_store
+            self._text = None
+        else:
+            self._text = response.text
+            self._spill_key = None
+            self._spill_store = None
+
+    @property
+    def text(self):
+        if self._text is not None:
+            return self._text
+        if self._spill_store is not None:
+            body = self._spill_store.read(self._spill_key)
+            if body is not None:
+                return body.decode("utf-8", errors="replace")
+        return ""
+
+    @property
+    def content(self):
+        if self._spill_store is not None:
+            return self._spill_store.read(self._spill_key) or b""
+        if self._text is not None:
+            return self._text.encode("utf-8", errors="replace")
+        return b""
+
+    def _cleanup(self):
+        if self._spill_store is not None and self._spill_key is not None:
+            self._spill_store.evict_and_delete(self._spill_key)
+            self._spill_key = None
+
+
 class HttpCompare:
     def __init__(
         self,
@@ -92,7 +145,6 @@ async def _baseline(self):
                 timeout=self.timeout,
             )
 
-            self.baseline = baseline_1
             if baseline_1 is None or baseline_2 is None:
                 log.debug("HTTP error while establishing baseline, aborting")
                 baseline_1_repr = f"HTTP {baseline_1.status_code}" if baseline_1 is not None else "None"
@@ -145,6 +197,13 @@ async def _baseline(self):
                 except Exception as e:
                     log.debug(f"on_baseline_ready callback raised: {e}")
 
+            # Replace the heavy blasthttp Response with a lightweight
+            # snapshot. If the scan has a body_spill_store the body bytes
+            # are written to disk and served from the LRU cache on demand.
+            scan = getattr(self.parent_helper, "scan", None)
+            store = getattr(scan, "body_spill_store", None)
+            self.baseline = _BaselineSnapshot(baseline_1, spill_store=store)
+
     def gen_cache_buster(self):
         return {self.parent_helper.rand_string(6): "1"}
 
diff --git a/bbot/modules/paramminer_headers.py b/bbot/modules/paramminer_headers.py
index aacebe698a..58ffbe946f 100644
--- a/bbot/modules/paramminer_headers.py
+++ b/bbot/modules/paramminer_headers.py
@@ -133,7 +133,7 @@ async def setup_deps(self):
     async def setup(self):
         self.recycle_words = self.config.get("recycle_words", True)
         self.event_dict = {}
-        self.already_checked = set()
+        self.already_checked = {}
 
         # global parameter blacklist (shared with excavate) — known framework/CDN/tracker names
         self.global_blacklist = {p.lower() for p in self.scan.config.get("parameter_blacklist", [])}
@@ -165,10 +165,10 @@ def rand_string(self, *args, **kwargs):
         return self.helpers.rand_string(*args, **kwargs)
 
     async def do_mining(self, wl, url, batch_size, compare_helper):
+        url_checked = self.already_checked.setdefault(url, set())
         for i in wl:
             if i not in self.wl:
-                h = hash(i + url)
-                self.already_checked.add(h)
+                url_checked.add(hash(i))
 
         results = set()
         abort_threshold = 15
@@ -316,10 +316,9 @@ async def finish(self):
             except HttpCompareError as e:
                 self.debug(f"Error initializing compare helper: {e}")
                 continue
+            url_checked = self.already_checked.get(url, set())
             words_to_process = {
-                i
-                for i in self._mutate_for_url(url, self.extracted_words_master)
-                if hash(i + url) not in self.already_checked
+                i for i in self._mutate_for_url(url, self.extracted_words_master) if hash(i) not in url_checked
             }
             try:
                 results = await self.do_mining(words_to_process, url, batch_size, compare_helper)
@@ -327,6 +326,7 @@ async def finish(self):
                 self.debug(f"Encountered HttpCompareError: [{e}] for URL [{url}]")
                 continue
             await self.process_results(event, results)
+            self.already_checked.pop(url, None)
 
     def _incoming_dedup_hash(self, event):
         # dedup by endpoint structure, not full URL string -- value mutations
diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py
index 048e5422a4..184fc5973a 100644
--- a/bbot/modules/wayback.py
+++ b/bbot/modules/wayback.py
@@ -1,4 +1,5 @@
 import re
+import bisect
 from collections import Counter
 from datetime import datetime
 from urllib.parse import parse_qs, urlparse, urlunparse
@@ -100,20 +101,58 @@ def _is_garbage_url(self, url):
         counts = Counter(segments)
         return counts.most_common(1)[0][1] > self._max_path_segment_repeats
 
+    # Per-URL match threshold for the YARA junk-URL rules.
+    # The akamai_bot_manager_url rule requires #strict_seg >= 2.
+    _junk_url_match_threshold = 2
+
     def _filter_urls_sync(self, urls):
-        """Drop blacklisted, garbage, and YARA-junk URLs in one pass. Returns (kept, junk_dropped)."""
-        kept = []
-        junk_dropped = 0
+        """Drop blacklisted, garbage, and YARA-junk URLs in one pass. Returns (kept, junk_dropped).
+
+        YARA matching is batched: all candidate URLs are concatenated into
+        a single blob (newline-delimited) and matched once, then string
+        instance offsets are mapped back to individual URLs and counted
+        per-URL against ``_junk_url_match_threshold``. This replaces the
+        previous per-URL ``rules.match()`` call and cuts allocation
+        pressure from ~22 GB cumulative to a single match invocation.
+        """
+        # Phase 1: cheap filters (blacklist + garbage)
+        candidates = []
         for url in urls:
             if any(bl in url for bl in self.url_blacklist):
                 continue
             if self._is_garbage_url(url):
                 continue
-            if self._junk_url_rules.match(data=url):
-                junk_dropped += 1
-                continue
-            kept.append(url)
-        return kept, junk_dropped
+            candidates.append(url)
+
+        if not candidates:
+            return [], 0
+
+        # Phase 2: batch YARA match on concatenated blob
+        junk_set = set()
+        blob = "\n".join(candidates)
+        matches = self._junk_url_rules.match(data=blob)
+        if matches:
+            offsets = []
+            pos = 0
+            for url in candidates:
+                offsets.append(pos)
+                pos += len(url) + 1
+
+            match_counts = [0] * len(candidates)
+            for m in matches:
+                for s in m.strings:
+                    for instance in s.instances:
+                        idx = bisect.bisect_right(offsets, instance.offset) - 1
+                        if 0 <= idx < len(candidates):
+                            match_counts[idx] += 1
+
+            threshold = self._junk_url_match_threshold
+            for idx, count in enumerate(match_counts):
+                if count >= threshold:
+                    junk_set.add(idx)
+
+        kept = [url for idx, url in enumerate(candidates) if idx not in junk_set]
+        return kept, len(junk_set)
 
     def _is_interesting_file(self, url):
         ext = get_file_extension(url)