From 6de8c8132af6538528b144ae7f8ee8c000ac40e6 Mon Sep 17 00:00:00 2001 From: liquidsec Date: Thu, 18 Jun 2026 10:43:33 -0400 Subject: [PATCH] Reduce memory allocation pressure in HttpCompare, wayback, and paramminer - HttpCompare: replace retained blasthttp Response with lightweight snapshot that spills body to BodySpillStore - wayback: batch YARA junk-URL matching into single blob match with offset mapping instead of per-URL calls - paramminer: scope already_checked per-URL so word sets are freed after finish() processes each endpoint --- bbot/core/helpers/diff.py | 61 +++++++++++++++++++++++++++++- bbot/modules/paramminer_headers.py | 12 +++--- bbot/modules/wayback.py | 55 +++++++++++++++++++++++---- 3 files changed, 113 insertions(+), 15 deletions(-) diff --git a/bbot/core/helpers/diff.py b/bbot/core/helpers/diff.py index dc258307d6..58e283af3b 100644 --- a/bbot/core/helpers/diff.py +++ b/bbot/core/helpers/diff.py @@ -8,6 +8,59 @@ log = logging.getLogger("bbot.core.helpers.diff") +class _BaselineSnapshot: + """Lightweight stand-in for a blasthttp Response held by HttpCompare. + + Stores only the fields external code accesses (status_code, headers, + text, content) and optionally spills the body to the scan's + BodySpillStore so the raw bytes don't pin Python heap memory for the + lifetime of the HttpCompare instance. + + The blasthttp Headers object is kept by reference (it survives + Response GC independently) to preserve case-insensitive lookups and + duplicate-header semantics that DeepDiff relies on. + """ + + __slots__ = ("status_code", "headers", "_text", "_spill_key", "_spill_store") + + def __init__(self, response, spill_store=None): + self.status_code = response.status_code + self.headers = response.headers + if spill_store is not None: + body_bytes = response.body_bytes or b"" + self._spill_key = f"baseline-{id(self):x}" + spill_store.write(self._spill_key, body_bytes) + self._spill_store = spill_store + self._text = None + else: + self._text = response.text + self._spill_key = None + self._spill_store = None + + @property + def text(self): + if self._text is not None: + return self._text + if self._spill_store is not None: + body = self._spill_store.read(self._spill_key) + if body is not None: + return body.decode("utf-8", errors="replace") + return "" + + @property + def content(self): + if self._spill_store is not None: + return self._spill_store.read(self._spill_key) or b"" + if self._text is not None: + return self._text.encode("utf-8", errors="replace") + return b"" + + def _cleanup(self): + if self._spill_store is not None and self._spill_key is not None: + self._spill_store.evict_and_delete(self._spill_key) + self._spill_key = None + + class HttpCompare: def __init__( self, @@ -92,7 +145,6 @@ async def _baseline(self): timeout=self.timeout, ) - self.baseline = baseline_1 if baseline_1 is None or baseline_2 is None: log.debug("HTTP error while establishing baseline, aborting") baseline_1_repr = f"HTTP {baseline_1.status_code}" if baseline_1 is not None else "None" @@ -145,6 +197,13 @@ async def _baseline(self): except Exception as e: log.debug(f"on_baseline_ready callback raised: {e}") + # Replace the heavy blasthttp Response with a lightweight + # snapshot. If the scan has a body_spill_store the body bytes + # are written to disk and served from the LRU cache on demand. + scan = getattr(self.parent_helper, "scan", None) + store = getattr(scan, "body_spill_store", None) + self.baseline = _BaselineSnapshot(baseline_1, spill_store=store) + def gen_cache_buster(self): return {self.parent_helper.rand_string(6): "1"} diff --git a/bbot/modules/paramminer_headers.py b/bbot/modules/paramminer_headers.py index aacebe698a..58ffbe946f 100644 --- a/bbot/modules/paramminer_headers.py +++ b/bbot/modules/paramminer_headers.py @@ -133,7 +133,7 @@ async def setup_deps(self): async def setup(self): self.recycle_words = self.config.get("recycle_words", True) self.event_dict = {} - self.already_checked = set() + self.already_checked = {} # global parameter blacklist (shared with excavate) — known framework/CDN/tracker names self.global_blacklist = {p.lower() for p in self.scan.config.get("parameter_blacklist", [])} @@ -165,10 +165,10 @@ def rand_string(self, *args, **kwargs): return self.helpers.rand_string(*args, **kwargs) async def do_mining(self, wl, url, batch_size, compare_helper): + url_checked = self.already_checked.setdefault(url, set()) for i in wl: if i not in self.wl: - h = hash(i + url) - self.already_checked.add(h) + url_checked.add(hash(i)) results = set() abort_threshold = 15 @@ -316,10 +316,9 @@ async def finish(self): except HttpCompareError as e: self.debug(f"Error initializing compare helper: {e}") continue + url_checked = self.already_checked.get(url, set()) words_to_process = { - i - for i in self._mutate_for_url(url, self.extracted_words_master) - if hash(i + url) not in self.already_checked + i for i in self._mutate_for_url(url, self.extracted_words_master) if hash(i) not in url_checked } try: results = await self.do_mining(words_to_process, url, batch_size, compare_helper) @@ -327,6 +326,7 @@ async def finish(self): self.debug(f"Encountered HttpCompareError: [{e}] for URL [{url}]") continue await self.process_results(event, results) + self.already_checked.pop(url, None) def _incoming_dedup_hash(self, event): # dedup by endpoint structure, not full URL string -- value mutations diff --git a/bbot/modules/wayback.py b/bbot/modules/wayback.py index 048e5422a4..184fc5973a 100644 --- a/bbot/modules/wayback.py +++ b/bbot/modules/wayback.py @@ -1,4 +1,5 @@ import re +import bisect from collections import Counter from datetime import datetime from urllib.parse import parse_qs, urlparse, urlunparse @@ -100,20 +101,58 @@ def _is_garbage_url(self, url): counts = Counter(segments) return counts.most_common(1)[0][1] > self._max_path_segment_repeats + # Per-URL match threshold for the YARA junk-URL rules. + # The akamai_bot_manager_url rule requires #strict_seg >= 2. + _junk_url_match_threshold = 2 + def _filter_urls_sync(self, urls): - """Drop blacklisted, garbage, and YARA-junk URLs in one pass. Returns (kept, junk_dropped).""" - kept = [] - junk_dropped = 0 + """Drop blacklisted, garbage, and YARA-junk URLs in one pass. Returns (kept, junk_dropped). + + YARA matching is batched: all candidate URLs are concatenated into + a single blob (newline-delimited) and matched once, then string + instance offsets are mapped back to individual URLs and counted + per-URL against ``_junk_url_match_threshold``. This replaces the + previous per-URL ``rules.match()`` call and cuts allocation + pressure from ~22 GB cumulative to a single match invocation. + """ + # Phase 1: cheap filters (blacklist + garbage) + candidates = [] for url in urls: if any(bl in url for bl in self.url_blacklist): continue if self._is_garbage_url(url): continue - if self._junk_url_rules.match(data=url): - junk_dropped += 1 - continue - kept.append(url) - return kept, junk_dropped + candidates.append(url) + + if not candidates: + return [], 0 + + # Phase 2: batch YARA match on concatenated blob + junk_set = set() + blob = "\n".join(candidates) + matches = self._junk_url_rules.match(data=blob) + if matches: + offsets = [] + pos = 0 + for url in candidates: + offsets.append(pos) + pos += len(url) + 1 + + match_counts = [0] * len(candidates) + for m in matches: + for s in m.strings: + for instance in s.instances: + idx = bisect.bisect_right(offsets, instance.offset) - 1 + if 0 <= idx < len(candidates): + match_counts[idx] += 1 + + threshold = self._junk_url_match_threshold + for idx, count in enumerate(match_counts): + if count >= threshold: + junk_set.add(idx) + + kept = [url for idx, url in enumerate(candidates) if idx not in junk_set] + return kept, len(junk_set) def _is_interesting_file(self, url): ext = get_file_extension(url)