Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 60 additions & 1 deletion bbot/core/helpers/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,59 @@
log = logging.getLogger("bbot.core.helpers.diff")


class _BaselineSnapshot:
"""Lightweight stand-in for a blasthttp Response held by HttpCompare.

Stores only the fields external code accesses (status_code, headers,
text, content) and optionally spills the body to the scan's
BodySpillStore so the raw bytes don't pin Python heap memory for the
lifetime of the HttpCompare instance.

The blasthttp Headers object is kept by reference (it survives
Response GC independently) to preserve case-insensitive lookups and
duplicate-header semantics that DeepDiff relies on.
"""

__slots__ = ("status_code", "headers", "_text", "_spill_key", "_spill_store")

def __init__(self, response, spill_store=None):
self.status_code = response.status_code
self.headers = response.headers
if spill_store is not None:
body_bytes = response.body_bytes or b""
self._spill_key = f"baseline-{id(self):x}"
spill_store.write(self._spill_key, body_bytes)
self._spill_store = spill_store
self._text = None
else:
self._text = response.text
self._spill_key = None
self._spill_store = None

@property
def text(self):
if self._text is not None:
return self._text
if self._spill_store is not None:
body = self._spill_store.read(self._spill_key)
if body is not None:
return body.decode("utf-8", errors="replace")
return ""

@property
def content(self):
if self._spill_store is not None:
return self._spill_store.read(self._spill_key) or b""
if self._text is not None:
return self._text.encode("utf-8", errors="replace")
return b""

def _cleanup(self):
if self._spill_store is not None and self._spill_key is not None:
self._spill_store.evict_and_delete(self._spill_key)
self._spill_key = None


class HttpCompare:
def __init__(
self,
Expand Down Expand Up @@ -92,7 +145,6 @@ async def _baseline(self):
timeout=self.timeout,
)

self.baseline = baseline_1
if baseline_1 is None or baseline_2 is None:
log.debug("HTTP error while establishing baseline, aborting")
baseline_1_repr = f"HTTP {baseline_1.status_code}" if baseline_1 is not None else "None"
Expand Down Expand Up @@ -145,6 +197,13 @@ async def _baseline(self):
except Exception as e:
log.debug(f"on_baseline_ready callback raised: {e}")

# Replace the heavy blasthttp Response with a lightweight
# snapshot. If the scan has a body_spill_store the body bytes
# are written to disk and served from the LRU cache on demand.
scan = getattr(self.parent_helper, "scan", None)
store = getattr(scan, "body_spill_store", None)
self.baseline = _BaselineSnapshot(baseline_1, spill_store=store)

def gen_cache_buster(self):
return {self.parent_helper.rand_string(6): "1"}

Expand Down
12 changes: 6 additions & 6 deletions bbot/modules/paramminer_headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ async def setup_deps(self):
async def setup(self):
self.recycle_words = self.config.get("recycle_words", True)
self.event_dict = {}
self.already_checked = set()
self.already_checked = {}

# global parameter blacklist (shared with excavate) — known framework/CDN/tracker names
self.global_blacklist = {p.lower() for p in self.scan.config.get("parameter_blacklist", [])}
Expand Down Expand Up @@ -165,10 +165,10 @@ def rand_string(self, *args, **kwargs):
return self.helpers.rand_string(*args, **kwargs)

async def do_mining(self, wl, url, batch_size, compare_helper):
url_checked = self.already_checked.setdefault(url, set())
for i in wl:
if i not in self.wl:
h = hash(i + url)
self.already_checked.add(h)
url_checked.add(hash(i))

results = set()
abort_threshold = 15
Expand Down Expand Up @@ -316,17 +316,17 @@ async def finish(self):
except HttpCompareError as e:
self.debug(f"Error initializing compare helper: {e}")
continue
url_checked = self.already_checked.get(url, set())
words_to_process = {
i
for i in self._mutate_for_url(url, self.extracted_words_master)
if hash(i + url) not in self.already_checked
i for i in self._mutate_for_url(url, self.extracted_words_master) if hash(i) not in url_checked
}
try:
results = await self.do_mining(words_to_process, url, batch_size, compare_helper)
except HttpCompareError as e:
self.debug(f"Encountered HttpCompareError: [{e}] for URL [{url}]")
continue
await self.process_results(event, results)
self.already_checked.pop(url, None)

def _incoming_dedup_hash(self, event):
# dedup by endpoint structure, not full URL string -- value mutations
Expand Down
55 changes: 47 additions & 8 deletions bbot/modules/wayback.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import bisect
from collections import Counter
from datetime import datetime
from urllib.parse import parse_qs, urlparse, urlunparse
Expand Down Expand Up @@ -100,20 +101,58 @@ def _is_garbage_url(self, url):
counts = Counter(segments)
return counts.most_common(1)[0][1] > self._max_path_segment_repeats

# Per-URL match threshold for the YARA junk-URL rules.
# The akamai_bot_manager_url rule requires #strict_seg >= 2.
_junk_url_match_threshold = 2

def _filter_urls_sync(self, urls):
"""Drop blacklisted, garbage, and YARA-junk URLs in one pass. Returns (kept, junk_dropped)."""
kept = []
junk_dropped = 0
"""Drop blacklisted, garbage, and YARA-junk URLs in one pass. Returns (kept, junk_dropped).

YARA matching is batched: all candidate URLs are concatenated into
a single blob (newline-delimited) and matched once, then string
instance offsets are mapped back to individual URLs and counted
per-URL against ``_junk_url_match_threshold``. This replaces the
previous per-URL ``rules.match()`` call and cuts allocation
pressure from ~22 GB cumulative to a single match invocation.
"""
# Phase 1: cheap filters (blacklist + garbage)
candidates = []
for url in urls:
if any(bl in url for bl in self.url_blacklist):
continue
if self._is_garbage_url(url):
continue
if self._junk_url_rules.match(data=url):
junk_dropped += 1
continue
kept.append(url)
return kept, junk_dropped
candidates.append(url)

if not candidates:
return [], 0

# Phase 2: batch YARA match on concatenated blob
junk_set = set()
blob = "\n".join(candidates)
matches = self._junk_url_rules.match(data=blob)
if matches:
offsets = []
pos = 0
for url in candidates:
offsets.append(pos)
pos += len(url) + 1

match_counts = [0] * len(candidates)
for m in matches:
for s in m.strings:
for instance in s.instances:
idx = bisect.bisect_right(offsets, instance.offset) - 1
if 0 <= idx < len(candidates):
match_counts[idx] += 1

threshold = self._junk_url_match_threshold
for idx, count in enumerate(match_counts):
if count >= threshold:
junk_set.add(idx)

kept = [url for idx, url in enumerate(candidates) if idx not in junk_set]
return kept, len(junk_set)

def _is_interesting_file(self, url):
ext = get_file_extension(url)
Expand Down