diff --git a/app/api/score/route.ts b/app/api/score/route.ts index 0708056..328ffb5 100644 --- a/app/api/score/route.ts +++ b/app/api/score/route.ts @@ -7,6 +7,7 @@ import { computeScore } from "afdocs"; import { AFDOCS_VERSION } from "@/lib/scoring"; import { inferCategory } from "@/lib/categorize"; import { isBlockedDomain } from "@/lib/blocked-domains"; +import { resolveSlugAlias } from "@/lib/slug-aliases"; export const runtime = "nodejs"; export const maxDuration = 300; @@ -134,13 +135,20 @@ async function detectDocsUrl(url: string): Promise<{ isLikely: boolean; warning? if (DOCS_PATHS.test(pathStr)) return { isLikely: true }; if (DOCS_PLATFORMS.test(host + parsed.pathname)) return { isLikely: true }; + // An llms.txt is a strong docs signal — but marketing sites increasingly ship one + // too (e.g. monday.com serves /llms.txt from its product homepage). For a bare apex + // root it's not sufficient on its own; defer to the homepage content check below so + // a marketing landing page can still be rejected. For any deeper path it stands. + const isRoot = parsed.pathname === "/" || parsed.pathname === ""; + let hasLlms = false; try { const r = await fetch(`${parsed.origin}/llms.txt`, { signal: AbortSignal.timeout(5000), headers: { "User-Agent": "Mozilla/5.0 (compatible; AgentScore/1.0)" }, }); - if (r.ok) return { isLikely: true }; + hasLlms = r.ok; } catch { /* ignore */ } + if (hasLlms && !isRoot) return { isLikely: true }; try { const r = await fetch(url, { @@ -162,6 +170,9 @@ async function detectDocsUrl(url: string): Promise<{ isLikely: boolean; warning? suggestion: `docs.${baseDomain}, ${parsed.origin}/docs, or ${parsed.origin}/api`, }; } catch { + // Couldn't analyze the page — if it advertised an llms.txt, trust that rather + // than reject on a fetch failure (only a *visible* marketing page is rejected). + if (hasLlms) return { isLikely: true }; return { isLikely: false, warning: `Could not fetch the URL — it may be protected by bot-detection.`, @@ -417,21 +428,31 @@ export async function POST(request: Request) { // When the URL has a meaningful path (e.g. docs.nvidia.com/dynamo vs docs.nvidia.com/heavyai), // use the full URL slug so path-scoped sites don't collide on the domain-derived name slug. const urlPath = (() => { try { return new URL(url).pathname.replace(/^\/|\/$/g, ''); } catch { return ''; } })(); - const effectiveSlug = slugParam || (effectiveName && !urlPath ? nameToSlug(effectiveName) : urlToSlug(url)); - console.log("[score] resolved slug:", effectiveSlug, "name:", effectiveName); - - // Return cached result if company already exists (skip when force=true or in development) + // Fern preview/staging hosts (*.ferndocs.com) always slug by URL so they stay distinct from the + // canonical live company entry — otherwise e.g. docusign.ferndocs.com collapses onto the "docusign" slug. + const isFernHost = (() => { try { return /(^|\.)ferndocs\.com$/i.test(new URL(url).hostname); } catch { return false; } })(); + const rawSlug = slugParam || (effectiveName && !urlPath && !isFernHost ? nameToSlug(effectiveName) : urlToSlug(url)); + // Alias a likely-typed domain (e.g. "monday" → "developer-monday-com-api-reference") to a curated + // leaderboard entry. This is a *redirect for lookups only*: we surface the existing canonical entry + // but never score/overwrite it. Actual scoring always stores under the raw slug (see runJob below). + const aliasSlug = resolveSlugAlias(rawSlug); + console.log("[score] resolved slug:", rawSlug, "name:", effectiveName, rawSlug !== aliasSlug ? `(alias → ${aliasSlug})` : ''); + + // Return cached result if company already exists (skip when force=true or in development). + // Prefer the alias target so a typed domain points at the curated entry. if (!force && process.env.NODE_ENV !== 'development') { try { - const existing = await getScoreBySlug(effectiveSlug); + const existing = + (await getScoreBySlug(aliasSlug)) ?? + (aliasSlug !== rawSlug ? await getScoreBySlug(rawSlug) : null); if (existing) { - console.log("[score] company already exists, returning cached result:", effectiveSlug); + console.log("[score] company already exists, returning cached result:", existing.slug); const jobId = crypto.randomUUID(); writeJob(jobId, { status: "complete", score: existing.score, grade: existing.grade, - slug: effectiveSlug, + slug: existing.slug, summary: { total: existing.checks.total, pass: existing.checks.pass, @@ -440,7 +461,7 @@ export async function POST(request: Request) { }, results: existing.results, }); - return NextResponse.json({ jobId, slug: effectiveSlug, cached: true }); + return NextResponse.json({ jobId, slug: existing.slug, cached: true }); } } catch { /* Supabase check failed — proceed with scoring */ } } @@ -468,13 +489,13 @@ export async function POST(request: Request) { console.log("[score] job created:", jobId); if (process.env.NODE_ENV === 'development') { - runJob(jobId, url, effectiveSlug, effectiveName ?? undefined, hidden).catch(console.error); + runJob(jobId, url, rawSlug, effectiveName ?? undefined, hidden).catch(console.error); } else { - waitUntil(runJob(jobId, url, effectiveSlug, effectiveName ?? undefined, hidden)); + waitUntil(runJob(jobId, url, rawSlug, effectiveName ?? undefined, hidden)); } // Set updated rate limit cookie - const response = NextResponse.json({ jobId, slug: effectiveSlug }); + const response = NextResponse.json({ jobId, slug: rawSlug }); response.headers.set('Set-Cookie', buildRateLimitCookie(rlTimestamps)); return response; } catch (error) { diff --git a/app/company/[slug]/page.tsx b/app/company/[slug]/page.tsx index 85a5f74..8210875 100644 --- a/app/company/[slug]/page.tsx +++ b/app/company/[slug]/page.tsx @@ -1,6 +1,7 @@ import { getCompanyWithFallback } from '@/lib/scores'; import type { CheckResult } from '@/lib/scores'; -import { notFound } from 'next/navigation'; +import { notFound, redirect } from 'next/navigation'; +import { resolveSlugAlias } from '@/lib/slug-aliases'; import Link from 'next/link'; import type { Metadata } from 'next'; import ScoreRing from './ScoreRing'; @@ -125,6 +126,10 @@ function buildSummary(company: { name: string; score: number; grade: string; che } export default async function CompanyPage({ params }: { params: { slug: string } }) { + // Redirect known duplicate slugs (e.g. /company/monday) to the canonical leaderboard entry. + const canonical = resolveSlugAlias(params.slug); + if (canonical !== params.slug) redirect(`/agent-score/company/${canonical}`); + const company = await getCompanyWithFallback(params.slug); if (!company) notFound(); diff --git a/lib/blocked-domains.ts b/lib/blocked-domains.ts index d1713cc..bc09fa0 100644 --- a/lib/blocked-domains.ts +++ b/lib/blocked-domains.ts @@ -9,12 +9,21 @@ const BLOCKED_DOMAINS = new Set([ 'porntrex.com', 'anysex.com', 'fuq.com', 'ixxx.com', 'rulertube.com', ]); +// Marketing/landing sites that should never be graded as docs themselves, but whose +// docs subdomains (e.g. developers.monday.com) ARE eligible. Matched on the exact apex +// host only (plus www) — subdomains are NOT blocked. This is the pre-cache, pre-scoring +// guard: it guarantees rejection even if a stale cached row exists or detection flakes. +const BLOCKED_APEX_ONLY = new Set([ + 'monday.com', +]); + export function isBlockedDomain(url: string): boolean { try { const normalized = /^https?:\/\//i.test(url) ? url : `https://${url}`; const { hostname } = new URL(normalized); const host = hostname.replace(/^www\./, '').toLowerCase(); if (BLOCKED_TLDS.has('.' + host.split('.').pop())) return true; + if (BLOCKED_APEX_ONLY.has(host)) return true; if (BLOCKED_DOMAINS.has(host)) return true; for (const d of Array.from(BLOCKED_DOMAINS)) { if (host === d || host.endsWith('.' + d)) return true; diff --git a/lib/categorize.ts b/lib/categorize.ts index 95b6a69..066a30a 100644 --- a/lib/categorize.ts +++ b/lib/categorize.ts @@ -109,14 +109,14 @@ function matchPatterns(docsUrl: string, name?: string): Category | undefined { /** * Ask Claude to classify the company when pattern-matching returns nothing. - * Requires OPENAI_API_KEY in the environment. + * Requires ANTHROPIC_API_KEY in the environment. * Returns null on any failure so the caller can fall back gracefully. */ async function inferCategoryWithLLM( docsUrl: string, name?: string, ): Promise { - const apiKey = process.env.OPENAI_API_KEY; + const apiKey = process.env.ANTHROPIC_API_KEY; if (!apiKey) return null; const prompt = `You are classifying a company's API/developer documentation site into exactly one category. @@ -132,14 +132,15 @@ Rules: - Reply with ONLY the category name, nothing else.`; try { - const res = await fetch('https://api.openai.com/v1/chat/completions', { + const res = await fetch('https://api.anthropic.com/v1/messages', { method: 'POST', headers: { - 'Authorization': `Bearer ${apiKey}`, + 'x-api-key': apiKey, + 'anthropic-version': '2023-06-01', 'content-type': 'application/json', }, body: JSON.stringify({ - model: 'gpt-4o-mini', + model: 'claude-haiku-4-5', max_tokens: 16, messages: [{ role: 'user', content: prompt }], }), @@ -147,12 +148,12 @@ Rules: }); if (!res.ok) { - console.warn('[categorize] OpenAI API error:', res.status); + console.warn('[categorize] Anthropic API error:', res.status); return null; } const data = await res.json(); - const raw = (data?.choices?.[0]?.message?.content ?? '').trim(); + const raw = (data?.content?.[0]?.text ?? '').trim(); const match = CATEGORIES.find( (c) => c.toLowerCase() === raw.toLowerCase(), ); @@ -166,7 +167,7 @@ Rules: /** * Infer a category for a docs URL + company name. * 1. Fast pattern match — if it hits, return immediately (no API call). - * 2. Ask GPT-4o mini if patterns don't match. + * 2. Ask Claude Haiku if patterns don't match. * 3. Fall back to 'Other' if the API call fails or is unavailable. */ export async function inferCategory( diff --git a/lib/slug-aliases.ts b/lib/slug-aliases.ts new file mode 100644 index 0000000..5a6757b --- /dev/null +++ b/lib/slug-aliases.ts @@ -0,0 +1,10 @@ +// Maps an auto-generated/typed slug → the canonical leaderboard slug we want to show. +// Use this when a domain a user is likely to type (e.g. monday.com → "monday") resolves +// to a weaker entry than the curated leaderboard entry we'd rather surface. +export const SLUG_ALIASES: Record = { + // e.g. monday: 'developer-monday-com-api-reference', +}; + +export function resolveSlugAlias(slug: string): string { + return SLUG_ALIASES[slug] ?? slug; +}