[
  {
    "id": "mistral-7b-instruct",
    "name": "Mistral 7B Instruct",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 7,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "Mistral AI's breakout 7B instruct model. Still a go-to baseline for fast, low-cost inference and the most fine-tuned open-weight model in the wild.",
    "best": [
      "Bootstrapping a local chatbot on a single consumer GPU",
      "Cheap, high-throughput batch inference where 2024+ reasoning isn't required",
      "Fine-tuning experiments thanks to the deep ecosystem of LoRAs and quants",
      "Edge or on-prem deployments under tight latency budgets",
      "Apache 2.0 commercial use with zero licensing friction"
    ],
    "_en_extras": {
      "strengths": [
        "Excellent quality-to-speed ratio for a 7B",
        "Fully permissive Apache 2.0 license",
        "Mature ecosystem of fine-tunes, GGUFs, and quants",
        "Solid multilingual coverage, including strong French"
      ],
      "weaknesses": [
        "Outclassed on reasoning by 2024+ models like Qwen 2.5 and Llama 3.1",
        "32k context is no longer competitive",
        "Training data cutoff in 2023 shows on recent topics"
      ],
      "verdict": "A reliable, freely licensed workhorse — fine as a baseline, but newer 7Bs win on quality."
    }
  },
  {
    "id": "mistral-small-24b",
    "name": "Mistral Small 3",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 24,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "code"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 14,
      "q5": 17,
      "q8": 26,
      "fp16": 48
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 4,
      "mid": 15,
      "high": 40
    },
    "desc": "Mistral AI's 24B dense model that closes most of the gap with 70B-class models. Best quality-per-parameter we've measured at this size in 2025.",
    "best": [
      "Self-hosting a near-frontier assistant on a single 24GB GPU",
      "Agentic workflows and tool calling where latency matters",
      "RAG over documents that fit within the 32k-token context",
      "Commercial deployments needing Apache 2.0",
      "Replacing Llama 3 70B to cut VRAM and inference cost"
    ],
    "_en_extras": {
      "strengths": [
        "Quality approaching Llama 3 70B at a third the size",
        "Low latency relative to peers",
        "32k context window",
        "Strong tool use and agent behavior",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "Needs ~14GB VRAM at Q4, more for higher precision",
        "Trails Qwen 2.5 Coder on dedicated coding tasks",
        "No native vision (see Small 3.1 for that)"
      ],
      "verdict": "The 2025 sweet spot for open-weight chat — frontier-adjacent quality at a tractable size."
    }
  },
  {
    "id": "mixtral-8x7b",
    "name": "Mixtral 8x7B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 47,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 26,
      "q5": 32,
      "q8": 50,
      "fp16": 94
    },
    "ramCpu": 48,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 35
    },
    "desc": "The Mistral AI MoE that popularized open-weight sparse models. Eight 7B experts deliver 47B-class output, but you pay 47B-class VRAM costs.",
    "best": [
      "Inference servers with ample VRAM where speed-per-quality matters",
      "Workloads needing strong multilingual and coding performance",
      "Apache 2.0 commercial deployments",
      "Comparative benchmarks against newer dense models",
      "Fine-tuning research on a well-documented MoE architecture"
    ],
    "_en_extras": {
      "strengths": [
        "Quality well above dense models of equivalent active params",
        "Strong coding and multilingual performance",
        "Apache 2.0 license",
        "Battle-tested in production stacks"
      ],
      "weaknesses": [
        "Roughly 26GB VRAM at Q4 — same footprint as a dense 47B",
        "Eclipsed by Qwen 3 and Llama 3.3 in 2025 benchmarks",
        "32k context now feels limiting",
        "Knowledge cutoff predates current tooling"
      ],
      "verdict": "A historically important MoE, now a second-tier choice — newer dense 24-32B models match it for less VRAM."
    }
  },
  {
    "id": "lucie-7b",
    "name": "Lucie 7B",
    "author": "OpenLLM-France",
    "origin": "fr",
    "params": 7,
    "family": "Lucie",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "fr"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "A French-sovereign 7B model from OpenLLM-France, backed by CNRS and LINAGORA, with a fully transparent and auditable training corpus.",
    "best": [
      "EU public-sector projects with data sovereignty requirements",
      "French-language content generation and editorial work",
      "Research needing reproducible, openly documented training data",
      "Regulated environments demanding training-data provenance",
      "Demonstrating non-US-trained alternatives to stakeholders"
    ],
    "_en_extras": {
      "strengths": [
        "Full European data sovereignty story",
        "Publicly available training corpus",
        "Strong formal French output",
        "Backed by CNRS and LINAGORA"
      ],
      "weaknesses": [
        "4k context is too short for modern RAG or long docs",
        "Weaker English than Mistral or Llama at the same size",
        "Smaller ecosystem of fine-tunes and tools"
      ],
      "verdict": "Pick it for sovereignty and provenance, not raw capability — the 4k context is the dealbreaker for most workloads."
    }
  },
  {
    "id": "croissant-llm",
    "name": "CroissantLLM 1.3B",
    "author": "CroissantLLM",
    "origin": "fr",
    "params": 1.3,
    "family": "Croissant",
    "license": "MIT",
    "tags": [
      "chat",
      "fr",
      "small"
    ],
    "ctx": 2048,
    "vram": {
      "q4": 1,
      "q5": 1.2,
      "q8": 2,
      "fp16": 3
    },
    "ramCpu": 4,
    "tokSec": {
      "low": 40,
      "mid": 120,
      "high": 250
    },
    "desc": "A 1.3B bilingual French/English model from Sorbonne's MLIA lab, light enough to run on a CPU and shipped with a fully auditable training corpus.",
    "best": [
      "CPU-only or extreme edge deployments",
      "Academic research needing a transparent, reproducible model",
      "Lightweight bilingual French/English classification or completion",
      "Teaching and demos where size and openness matter",
      "Embedded devices with under 2GB of memory"
    ],
    "_en_extras": {
      "strengths": [
        "Runs in roughly 1GB VRAM at Q4",
        "Native French/English balance, not an afterthought",
        "Fully auditable training corpus",
        "Permissive MIT-style licensing"
      ],
      "weaknesses": [
        "2048-token context is too tight for most real tasks",
        "Quality is well below any modern 2025 model",
        "No vision, tools, or chain-of-thought reasoning",
        "Limited ecosystem and tooling support"
      ],
      "verdict": "An academic milestone for transparent bilingual training — not competitive for production use in 2025."
    }
  },
  {
    "id": "llama3-8b",
    "name": "Llama 3.1 8B",
    "author": "Meta",
    "origin": "us",
    "params": 8,
    "family": "Llama",
    "license": "Llama 3 Community",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "Meta's Llama 3.1 8B, the open-weight benchmark of 2024. A well-behaved instruction follower with a 128k context and the largest ecosystem in the open-source world.",
    "best": [
      "General-purpose chat or assistant deployments on a single consumer GPU",
      "Long-context RAG up to 128k tokens",
      "Production workloads needing the most mature open-weight tooling",
      "Fine-tuning baselines for downstream tasks",
      "Drop-in replacement for Mistral 7B with longer context"
    ],
    "_en_extras": {
      "strengths": [
        "128k context window",
        "Strong instruction following and coding",
        "Enormous ecosystem of fine-tunes and integrations",
        "Solid quality-to-size ratio"
      ],
      "weaknesses": [
        "Beaten by Qwen 3 8B on most 2025 benchmarks",
        "No vision in this checkpoint",
        "Llama Community license restricts use above 700M MAU"
      ],
      "verdict": "Still a dependable open-weight default, but Qwen 3 8B is the better pick if license terms allow."
    }
  },
  {
    "id": "llama3-70b",
    "name": "Llama 3.1 70B",
    "author": "Meta",
    "origin": "us",
    "params": 70,
    "family": "Llama",
    "license": "Llama 3 Community",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 40,
      "q5": 48,
      "q8": 75,
      "fp16": 140
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "Meta's Llama 3.1 70B, the open-weight model that first felt like a credible GPT-4 alternative. Needs serious hardware — think dual 3090s or an A100.",
    "best": [
      "On-prem deployments needing frontier-adjacent quality",
      "Long-context reasoning and document workloads up to 128k tokens",
      "Self-hosted alternatives to GPT-4 class APIs",
      "Multi-GPU inference servers already provisioned for 70B-class models",
      "Fine-tuning when you need a strong base for domain adaptation"
    ],
    "_en_extras": {
      "strengths": [
        "Benchmark-leading quality for open-weight 70B",
        "128k context",
        "Strong reasoning and code generation",
        "Mature serving stack in vLLM, TGI, llama.cpp"
      ],
      "weaknesses": [
        "~40GB VRAM at Q4 — minimum two 24GB GPUs",
        "Llama Community license restricts use above 700M MAU",
        "Superseded by Llama 3.3 70B, which costs the same to serve and benefits from better post-training"
      ],
      "verdict": "A milestone model, but Llama 3.3 70B delivers the same quality with better post-training — use 3.3 unless you have a reason."
    }
  },
  {
    "id": "llama3-3b",
    "name": "Llama 3.2 3B",
    "author": "Meta",
    "origin": "us",
    "params": 3,
    "family": "Llama",
    "license": "Llama 3 Community",
    "tags": [
      "chat",
      "small"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 2.5,
      "q5": 3,
      "q8": 4.5,
      "fp16": 7
    },
    "ramCpu": 6,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 160
    },
    "desc": "Meta's 3B instruct model with a full 128k context, tuned for laptops, mobile, and edge devices where memory and battery matter.",
    "best": [
      "On-device assistants for laptops, phones, or tablets",
      "CPU-only inference where speed beats raw quality",
      "Long-context summarization on constrained hardware",
      "Latency-critical agent loops",
      "Local autocomplete or text classification"
    ],
    "_en_extras": {
      "strengths": [
        "128k context in a 3B parameter footprint",
        "Fast CPU inference",
        "Strong baseline for edge and mobile use cases",
        "Distilled from larger Llama models for better quality density"
      ],
      "weaknesses": [
        "Noticeably weaker than 7B+ models on complex tasks",
        "No vision in this checkpoint",
        "Subject to Llama Community license terms"
      ],
      "verdict": "The best 3B open-weight model for edge use cases — pick it when memory and latency dominate the brief."
    }
  },
  {
    "id": "qwen25-7b",
    "name": "Qwen 2.5 7B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 7,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "Alibaba's Qwen 2.5 7B, a top-tier 7B for its era with a 128k context, strong multilingual coverage across 29 languages, and Apache 2.0 licensing.",
    "best": [
      "Multilingual assistants beyond English/French",
      "Long-context document Q&A and summarization",
      "General-purpose chat with permissive commercial licensing",
      "Math and code tasks on a single consumer GPU",
      "Drop-in alternative to Llama 3.1 8B"
    ],
    "_en_extras": {
      "strengths": [
        "128k context window",
        "Apache 2.0 license with no MAU restrictions",
        "Strong multilingual performance across 29 languages",
        "Better math and coding than Llama 3.1 8B at the same size"
      ],
      "weaknesses": [
        "Surpassed by Qwen 3 8B in 2025",
        "Trails Qwen 2.5 Coder on dedicated coding tasks",
        "Reasoning weaker than DeepSeek R1 distills"
      ],
      "verdict": "A still-useful general-purpose 7B with permissive licensing — but check Qwen 3 first if you're starting fresh."
    }
  },
  {
    "id": "qwen25-32b",
    "name": "Qwen 2.5 32B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 32,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Alibaba's Qwen 2.5 32B, the open-weight 32B reference of late 2024 — matching 70B-class quality on most benchmarks at half the VRAM.",
    "best": [
      "Self-hosted assistants on a single 24GB GPU at lower precision",
      "Long-context reasoning workloads up to 128k tokens",
      "Math and code-heavy pipelines",
      "Commercial deployments needing Apache 2.0",
      "A 70B alternative when VRAM is tight"
    ],
    "_en_extras": {
      "strengths": [
        "Quality on par with many 70B models",
        "128k context",
        "Apache 2.0 license",
        "Strong math, code, and reasoning"
      ],
      "weaknesses": [
        "Needs ~19GB VRAM at Q4 — pushes the limits of a single 24GB card",
        "Outperformed by Qwen 3 32B in 2025",
        "No native vision"
      ],
      "verdict": "A landmark open-weight 32B that's still a strong default — upgrade to Qwen 3 32B when you can."
    }
  },
  {
    "id": "qwen25-coder-7b",
    "name": "Qwen 2.5 Coder 7B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 7,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "code"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "A 7B coding specialist from Alibaba covering 92 programming languages with a 128k context. Competitive with proprietary models on HumanEval at this size.",
    "best": [
      "Local IDE autocomplete and inline code suggestions",
      "Code review and refactoring assistants on a consumer GPU",
      "Multi-language codebases needing broad language coverage",
      "Repo-scale Q&A using the 128k window",
      "Cheap, high-throughput code generation pipelines"
    ],
    "_en_extras": {
      "strengths": [
        "Strong HumanEval and code completion for a 7B",
        "128k context for repo-scale prompts",
        "Coverage of 92 programming languages",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "Beaten clearly by the 32B variant on complex code tasks",
        "Weaker than general 7Bs for non-coding chat",
        "Limited reasoning on multi-step debugging"
      ],
      "verdict": "The right pick when you want a local code model that fits on a single 8GB-class GPU and still pulls its weight."
    }
  },
  {
    "id": "qwen25-coder-32b",
    "name": "Qwen 2.5 Coder 32B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 32,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "code"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Alibaba's Qwen 2.5 Coder 32B — the strongest open-weight code model we've benchmarked, trading punches with Claude 3.5 Sonnet on HumanEval.",
    "best": [
      "Self-hosted code copilots replacing proprietary APIs",
      "Repo-scale analysis and refactoring up to 128k tokens",
      "Polyglot codebases spanning dozens of languages",
      "Commercial code tooling needing Apache 2.0",
      "Generating production code where quality justifies the VRAM"
    ],
    "_en_extras": {
      "strengths": [
        "Best-in-class open-weight code generation",
        "Claude 3.5 Sonnet-level HumanEval scores",
        "128k context for repo-wide tasks",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "Requires ~19GB VRAM at Q4",
        "Weaker than Qwen 2.5 32B for general chat",
        "Slower than 7B-class models for autocomplete loops"
      ],
      "verdict": "The default open-weight choice for serious code work — frontier-grade quality without an API bill."
    }
  },
  {
    "id": "gemma2-2b",
    "name": "Gemma 2 2B",
    "author": "Google",
    "origin": "us",
    "params": 2,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "small"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 1.8,
      "q5": 2.2,
      "q8": 3.2,
      "fp16": 5
    },
    "ramCpu": 4,
    "tokSec": {
      "low": 35,
      "mid": 100,
      "high": 200
    },
    "desc": "Google's Gemma 2 2B, a compact instruct model distilled from larger Gemmas. Small enough to run on a Raspberry Pi 5 or modest CPU.",
    "best": [
      "Edge devices, microservers, and SBCs",
      "Background tasks where latency beats sophistication",
      "Text classification, simple summarization, and routing",
      "Educational and demo deployments",
      "Fallback model when GPU resources are unavailable"
    ],
    "_en_extras": {
      "strengths": [
        "Runs comfortably in under 2GB VRAM at Q4",
        "Best-in-class 2B quality for its release window",
        "Workable on commodity CPUs",
        "Google's Gemma license permits broad use"
      ],
      "weaknesses": [
        "8k context is restrictive for modern RAG",
        "Falls apart on multi-step reasoning",
        "No vision, no tool calling out of the box"
      ],
      "verdict": "The best 2B for edge and CPU workloads — just don't expect it to reason."
    }
  },
  {
    "id": "gemma2-9b",
    "name": "Gemma 2 9B",
    "author": "Google",
    "origin": "us",
    "params": 9,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 6,
      "q5": 7.5,
      "q8": 11,
      "fp16": 20
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 9,
      "mid": 28,
      "high": 75
    },
    "desc": "Google's Gemma 2 9B, a distilled instruct model that outperforms Llama 3 8B on several benchmarks at a slightly larger size.",
    "best": [
      "General-purpose chat with stronger output quality than Llama 3 8B",
      "Workloads that don't need a long context window",
      "Instruction-following tasks and structured output",
      "Single consumer GPU deployments",
      "Fine-tuning baselines under Google's Gemma license"
    ],
    "_en_extras": {
      "strengths": [
        "Beats Llama 3 8B on multiple benchmarks",
        "Solid quality-per-parameter",
        "Reliable instruction following",
        "Distilled from Gemma 2 27B for better quality density"
      ],
      "weaknesses": [
        "8k context is the standout limitation",
        "No vision capabilities",
        "Gemma license is more restrictive than Apache 2.0"
      ],
      "verdict": "A strong 9B if you can live with 8k context — otherwise pick Qwen 2.5 7B or Llama 3.1 8B for the 128k window."
    }
  },
  {
    "id": "gemma2-27b",
    "name": "Gemma 2 27B",
    "author": "Google",
    "origin": "us",
    "params": 27,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 16,
      "q5": 19,
      "q8": 29,
      "fp16": 54
    },
    "ramCpu": 28,
    "tokSec": {
      "low": 3,
      "mid": 13,
      "high": 32
    },
    "desc": "The flagship of the Gemma 2 family from Google. Approaches 70B-class quality on a single 24GB GPU at Q4, with strong multilingual coverage.",
    "best": [
      "Self-hosted assistants needing high quality on one consumer GPU",
      "Multilingual content workflows including French",
      "Instruction-heavy tasks like classification and structured output",
      "Workloads that don't require long context",
      "Replacing 70B models when VRAM is constrained"
    ],
    "_en_extras": {
      "strengths": [
        "Quality close to 70B-class models",
        "Runs in roughly 16GB VRAM at Q4",
        "Strong instruction following",
        "Robust multilingual output"
      ],
      "weaknesses": [
        "8k context is a major handicap in 2025",
        "Gemma license is less permissive than Apache 2.0",
        "No vision in this checkpoint"
      ],
      "verdict": "Excellent raw quality undermined by an 8k context — only pick it when your prompts stay short."
    }
  },
  {
    "id": "phi35-mini",
    "name": "Phi-3.5 Mini",
    "author": "Microsoft",
    "origin": "us",
    "params": 3.8,
    "family": "Phi",
    "license": "MIT",
    "tags": [
      "chat",
      "small"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 10,
      "q5": 12,
      "q8": 18,
      "fp16": 33
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 14,
      "mid": 40,
      "high": 100
    },
    "desc": "Microsoft's Phi-3.5 Mini, a 3.8B model trained on heavily curated synthetic data with a 128k context. Punches above its weight on reasoning.",
    "best": [
      "Long-context tasks on hardware that can't fit a 7B",
      "Reasoning-heavy workloads at small size",
      "MIT-licensed embedded or commercial deployments",
      "Latency-critical assistants on consumer hardware",
      "STEM-focused tutoring and Q&A apps"
    ],
    "_en_extras": {
      "strengths": [
        "128k context in a 3.8B footprint",
        "MIT license with no commercial restrictions",
        "Fast inference on modest hardware",
        "Strong reasoning relative to its size"
      ],
      "weaknesses": [
        "Memory footprint is high for a 3.8B at full context",
        "Outclassed by Phi-4 14B on overall quality",
        "Synthetic-heavy training can show as a narrow knowledge base"
      ],
      "verdict": "A clever small model with a huge context — useful when you need 128k tokens and minimal VRAM."
    }
  },
  {
    "id": "phi4-14b",
    "name": "Phi-4 14B",
    "author": "Microsoft",
    "origin": "us",
    "params": 14,
    "family": "Phi",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "reasoning"
    ],
    "ctx": 16384,
    "vram": {
      "q4": 9,
      "q5": 11,
      "q8": 16,
      "fp16": 28
    },
    "ramCpu": 16,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 55
    },
    "desc": "Microsoft's Phi-4 14B, trained on ultra-curated synthetic data with a heavy STEM bias. The 14B reasoning leader at the end of 2024.",
    "best": [
      "Math, science, and structured reasoning workloads",
      "Coding assistants where quality beats context length",
      "MIT-licensed commercial deployments",
      "Mid-size GPU deployments needing strong reasoning",
      "Replacing larger models on STEM-heavy benchmarks"
    ],
    "_en_extras": {
      "strengths": [
        "Top-tier 14B reasoning at release",
        "MIT license",
        "Strong math, science, and code performance",
        "Tight, well-formatted outputs"
      ],
      "weaknesses": [
        "16k context is a significant limitation",
        "Weaker multilingual coverage than Qwen",
        "Narrower world knowledge from synthetic training"
      ],
      "verdict": "The reasoning-focused 14B to pick — just budget around its short context window."
    }
  },
  {
    "id": "deepseek-r1-7b",
    "name": "DeepSeek R1 Distill 7B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 7,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "reasoning"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "A 7B DeepSeek model distilled from R1 671B with explicit chain-of-thought reasoning. Surprisingly strong on AIME and MATH for its size.",
    "best": [
      "Math, logic, and step-by-step problem solving",
      "Reasoning-heavy tasks on a single consumer GPU",
      "Experimenting with explicit chain-of-thought outputs",
      "MIT-licensed local reasoning assistants",
      "Tutoring and STEM Q&A"
    ],
    "_en_extras": {
      "strengths": [
        "Explicit chain-of-thought reasoning at 7B scale",
        "Strong AIME and MATH scores for its size",
        "32k context",
        "MIT license"
      ],
      "weaknesses": [
        "Very verbose due to thinking tokens",
        "Trails the 32B distill on complex reasoning",
        "Higher token costs per response",
        "Weaker than general 7Bs on casual chat"
      ],
      "verdict": "A capable reasoning-specialist 7B — but bump up to the 32B distill if accuracy matters more than tokens."
    }
  },
  {
    "id": "deepseek-r1-32b",
    "name": "DeepSeek R1 Distill 32B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 32,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "reasoning"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "The 32B DeepSeek R1 distill — the best accessible open-weight reasoner we've tested. Explicit chain-of-thought, MIT-licensed, runs on a single 24GB GPU.",
    "best": [
      "Math, logic, and proof-style problems",
      "Code debugging where explicit reasoning helps",
      "Research workflows needing visible chain-of-thought",
      "Self-hosted alternatives to o1-mini-class APIs",
      "Commercial use under MIT license"
    ],
    "_en_extras": {
      "strengths": [
        "Best open-weight reasoner that fits on one consumer GPU",
        "Excellent math and science performance",
        "Explicit step-by-step thinking",
        "MIT license",
        "32k context"
      ],
      "weaknesses": [
        "Heavy thinking-token output inflates latency and cost",
        "Slow time-to-first-useful-answer",
        "32k context is shorter than most 2025 peers",
        "Overkill for simple chat"
      ],
      "verdict": "The go-to local reasoning model for STEM and code — accept the verbosity, get the accuracy."
    }
  },
  {
    "id": "deepseek-coder-v2-16b",
    "name": "DeepSeek Coder V2 Lite 16B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 16,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "code"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 10,
      "q5": 12,
      "q8": 18,
      "fp16": 32
    },
    "ramCpu": 18,
    "tokSec": {
      "low": 5,
      "mid": 18,
      "high": 45
    },
    "desc": "A 16B MoE code specialist from DeepSeek covering 338 programming languages with a 128k context. Fast inference for its quality tier.",
    "best": [
      "Code generation across uncommon or niche languages",
      "Repo-scale code Q&A using the 128k window",
      "Local code assistants where MoE inference speed matters",
      "Bug fixing and refactoring tasks",
      "MIT-licensed code tooling"
    ],
    "_en_extras": {
      "strengths": [
        "128k context for code",
        "MoE architecture keeps inference fast",
        "Coverage of 338 programming languages",
        "Strong code generation and repair"
      ],
      "weaknesses": [
        "Lite version trails the 236B DeepSeek Coder V2 by a wide margin",
        "Beaten by Qwen 2.5 Coder 32B on standard benchmarks",
        "MoE memory footprint is larger than active params suggest"
      ],
      "verdict": "Worth a look for exotic language coverage and speed — Qwen 2.5 Coder 32B still wins on raw quality."
    }
  },
  {
    "id": "llama32-vision-11b",
    "name": "Llama 3.2 Vision 11B",
    "author": "Meta",
    "origin": "us",
    "params": 11,
    "family": "Llama",
    "license": "Llama 3 Community",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 8,
      "q5": 10,
      "q8": 14,
      "fp16": 24
    },
    "ramCpu": 14,
    "tokSec": {
      "low": 6,
      "mid": 22,
      "high": 55
    },
    "desc": "Meta's first official multimodal Llama. An 11B vision-language model built on Llama 3.1 8B with added image adapters and a 128k text context.",
    "best": [
      "OCR and document understanding on a consumer GPU",
      "Image captioning and description pipelines",
      "Chart and graph analysis",
      "Mixed text-and-image RAG workloads",
      "Llama ecosystem deployments needing vision"
    ],
    "_en_extras": {
      "strengths": [
        "128k text context with image input",
        "Strong OCR and image description",
        "Built on the well-supported Llama 3 base",
        "First-party Meta multimodal release"
      ],
      "weaknesses": [
        "Vision quality trails Qwen2-VL and LLaVA-OneVision",
        "Subject to Llama Community license terms",
        "No video understanding",
        "Image inputs add significant VRAM overhead"
      ],
      "verdict": "A solid Llama-family vision model — but Qwen2-VL is the better open-weight choice when license terms allow."
    }
  },
  {
    "id": "qwen2-vl-7b",
    "name": "Qwen 2 VL 7B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 7,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 8,
      "mid": 25,
      "high": 60
    },
    "desc": "Alibaba's Qwen 2 VL 7B — a top-tier open-weight vision model with dynamic resolution, multilingual OCR, and short video understanding.",
    "best": [
      "Multilingual OCR and document extraction",
      "High-resolution image analysis up to 16K pixels",
      "Short video understanding and summarization",
      "Chart, diagram, and table parsing",
      "Apache 2.0 commercial vision pipelines"
    ],
    "_en_extras": {
      "strengths": [
        "Dynamic resolution from 20px up to 16K",
        "Best-in-class OCR and document handling at 7B",
        "Apache 2.0 license",
        "Short video input support"
      ],
      "weaknesses": [
        "32k combined text+image context",
        "Outperformed by Qwen3-VL on newer benchmarks",
        "Memory pressure scales fast at high resolutions"
      ],
      "verdict": "The strongest open-weight 7B vision model for OCR and documents — upgrade to Qwen3-VL once it fits your stack."
    }
  },
  {
    "id": "mistral-nemo-12b",
    "name": "Mistral Nemo 12B Instruct",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 12,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 7,
      "q5": 9,
      "q8": 13,
      "fp16": 24
    },
    "ramCpu": 16,
    "tokSec": {
      "low": 8,
      "mid": 25,
      "high": 70
    },
    "desc": "Mistral AI and NVIDIA's co-developed 12B instruct model with 128k context, the Tekken tokenizer, and strong European multilingual coverage.",
    "best": [
      "Multilingual chat across European languages",
      "Long-context summarization and RAG",
      "Replacing Mistral 7B with a noticeable quality bump",
      "Apache 2.0 commercial deployments on a single 24GB GPU",
      "NVIDIA-tuned inference stacks"
    ],
    "_en_extras": {
      "strengths": [
        "128k context window",
        "Strong European multilingual performance",
        "Apache 2.0 license",
        "Efficient Tekken tokenizer reduces token counts"
      ],
      "weaknesses": [
        "Reasoning trails Mistral Small 3.1",
        "No vision",
        "Eclipsed by Small 3 on most general benchmarks"
      ],
      "verdict": "A clean midsize Mistral with great multilingual chops — Small 3.1 wins overall, but Nemo's tokenizer remains attractive."
    }
  },
  {
    "id": "mistral-small-31-24b",
    "name": "Mistral Small 3.1 24B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 24,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "vision",
      "multilingual",
      "fr"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 14,
      "q5": 17,
      "q8": 26,
      "fp16": 48
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 4,
      "mid": 15,
      "high": 40
    },
    "desc": "Mistral AI's Small 3.1 — Small 3 plus a vision encoder, a 128k context, and ~150 tok/s inference under Apache 2.0. Small 3.2 (June 2025) is a drop-in upgrade.",
    "best": [
      "Multimodal assistants needing both text and vision in one model",
      "Long-context RAG over mixed text and image sources",
      "Self-hosted Apache 2.0 deployments on a 24GB GPU",
      "High-throughput inference where latency matters",
      "Replacing separate text and vision models with a single 24B"
    ],
    "_en_extras": {
      "strengths": [
        "Vision and text combined in one 24B model",
        "128k context window",
        "Apache 2.0 license",
        "Around 150 tokens/sec inference"
      ],
      "weaknesses": [
        "Requires Ollama 0.6.5 or newer",
        "Small 3.2 (June 2025) is a marginal improvement worth picking instead",
        "Vision quality trails Qwen2-VL on OCR"
      ],
      "verdict": "The best open-weight 24B multimodal model under Apache 2.0 — and Small 3.2 makes it slightly better still."
    }
  },
  {
    "id": "llama33-70b",
    "name": "Llama 3.3 70B Instruct",
    "author": "Meta",
    "origin": "us",
    "params": 70,
    "family": "Llama",
    "license": "Llama 3.3 Community",
    "tags": [
      "chat",
      "general",
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 40,
      "q5": 48,
      "q8": 75,
      "fp16": 140
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "Meta's Llama 3.3 70B — same quality tier as Llama 3.1 405B at one-sixth the size, thanks to improved post-training. Weights are gated on Hugging Face.",
    "best": [
      "Self-hosted alternatives to GPT-4 and Claude APIs",
      "Long-context reasoning and code on multi-GPU servers",
      "Production workloads where 405B is too expensive to run",
      "Domain fine-tuning on a high-quality 70B base",
      "Enterprise deployments cleared under the Llama Community license"
    ],
    "_en_extras": {
      "strengths": [
        "Quality competitive with Llama 3.1 405B",
        "128k context window",
        "Strong reasoning and code performance",
        "Major efficiency gain vs the 405B model"
      ],
      "weaknesses": [
        "Hugging Face access is gated — must accept Meta's terms",
        "Llama Community license restricts use above 700M MAU",
        "No vision capabilities",
        "Still needs roughly 40GB VRAM at Q4"
      ],
      "verdict": "The best open-weight 70B available — pick it over Llama 3.1 70B unless you have a hard reason not to."
    }
  },
  {
    "id": "qwen3-8b",
    "name": "Qwen 3 8B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 8,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "Alibaba's 8B dense model with a toggleable thinking mode and broad multilingual coverage. Punches well above its weight for an 8B and runs comfortably on a single consumer GPU.",
    "best": [
      "You want one local 8B that handles both quick chat and harder reasoning via a thinking toggle",
      "You need multilingual coverage across 100+ languages without paying API fees",
      "You're prototyping agents and want long context (up to 131K with YaRN) on modest hardware",
      "You need an Apache 2.0 model for commercial deployment at the 8B tier"
    ],
    "_en_extras": {
      "strengths": [
        "Hybrid thinking/fast modes switchable per request",
        "Strong multilingual performance across 119 languages",
        "Up to 131K context via YaRN (32K native)",
        "Apache 2.0 — clean commercial use"
      ],
      "weaknesses": [
        "Thinking traces are verbose and burn tokens fast",
        "Ecosystem tooling still less mature than the Qwen 2.5 line"
      ],
      "verdict": "The best general-purpose Apache-licensed 8B for teams that want one model covering chat, reasoning, and 100+ languages."
    }
  },
  {
    "id": "qwen3-14b",
    "name": "Qwen 3 14B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 14,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 9,
      "q5": 11,
      "q8": 16,
      "fp16": 28
    },
    "ramCpu": 16,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 55
    },
    "desc": "A 14B dense model from Alibaba that matches Qwen 2.5 32B Base on STEM and code, with the same hybrid thinking system as the rest of the Qwen 3 family. The pragmatic sweet spot for a single 24GB GPU.",
    "best": [
      "You have a single 24GB GPU and want the strongest dense Qwen 3 that fits",
      "You need solid STEM and coding performance without jumping to a 32B",
      "You want a toggleable thinking mode for harder problems",
      "You need 131K context for long documents or codebases"
    ],
    "_en_extras": {
      "strengths": [
        "Matches Qwen 2.5 32B Base on STEM and code at less than half the size",
        "Hybrid thinking mode for harder reasoning passes",
        "131K context window",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Still trails dedicated reasoners like QwQ-32B on AIME-class problems",
        "Thinking mode output can balloon for simple prompts"
      ],
      "verdict": "The smartest dense 14B you can run locally — ideal for a single high-end consumer GPU."
    }
  },
  {
    "id": "qwen3-32b",
    "name": "Qwen 3 32B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 32,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Alibaba's 32B dense flagship with thinking mode, scoring 65.5 on MMLU-Pro and 39.8 on SuperGPQA. The strongest general-purpose Qwen 3 dense model before stepping up to the MoE.",
    "best": [
      "You want a single dense model for chat, code, and reasoning on a 48GB-class GPU",
      "You need multilingual coverage with strong reasoning headroom",
      "You want one Apache 2.0 model to standardize on for production",
      "You need 131K context for long-form work"
    ],
    "_en_extras": {
      "strengths": [
        "Strong reasoning with thinking mode enabled",
        "Solid MMLU-Pro and SuperGPQA scores for its size",
        "131K context window",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "QwQ-32B is sharper for pure reasoning tasks",
        "Verbose thinking traces inflate latency and cost"
      ],
      "verdict": "The most versatile Apache-licensed 32B available — pick this when you want one model for everything."
    }
  },
  {
    "id": "qwen3-235b-a22b",
    "name": "Qwen 3 235B-A22B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 235,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 142,
      "q5": 170,
      "q8": 250,
      "fp16": 470
    },
    "ramCpu": 160,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 28
    },
    "desc": "Alibaba's flagship MoE — 235B total, 22B active per token across 128 experts. Hits 85.7 on AIME 2024 and 70.7 on LiveCodeBench, putting it in frontier-open territory.",
    "best": [
      "You're running multi-GPU or a high-memory Apple Silicon machine and want frontier-open performance",
      "You need top-tier math and code reasoning under an Apache license",
      "You want MoE-class throughput (22B active) rather than dense 200B+ latency",
      "You're evaluating against closed frontier models and need a serious local baseline"
    ],
    "_en_extras": {
      "strengths": [
        "Frontier-open scores on AIME 2024 (85.7) and LiveCodeBench (70.7)",
        "Only 22B active parameters — fast for its total size",
        "Instruct-2507 and Thinking-2507 variants available",
        "Apache 2.0"
      ],
      "weaknesses": [
        "~142GB at Q4 — needs multi-GPU or a 192GB+ Apple Silicon host",
        "Not realistic for laptop or single-GPU deployment"
      ],
      "verdict": "Pick this when you have the hardware for frontier-open performance under an Apache license."
    }
  },
  {
    "id": "qwen25-vl-7b",
    "name": "Qwen 2.5 VL 7B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 7,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat",
      "general"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 8,
      "mid": 25,
      "high": 60
    },
    "desc": "A 7B vision-language model from Alibaba with state-of-the-art results in its class, scoring 95.7 on DocVQA. Handles hour-long video, bounding-box grounding, and multilingual OCR.",
    "best": [
      "You need strong document understanding and OCR on a single consumer GPU",
      "You're building pipelines around long video analysis or screenshot Q&A",
      "You need bounding-box grounding or structured JSON output from images",
      "You want commercial-friendly Apache licensing for a VLM"
    ],
    "_en_extras": {
      "strengths": [
        "State-of-the-art vision performance at the 7B tier",
        "Excellent multilingual OCR",
        "Long video input (over 1 hour)",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Requires a VLM-capable backend (Ollama 0.5+ or vLLM)",
        "Smaller than 72B sibling for the hardest visual reasoning"
      ],
      "verdict": "The default open VLM at 7B — best-in-class for document and video work on modest hardware."
    }
  },
  {
    "id": "qwen25-vl-72b",
    "name": "Qwen 2.5 VL 72B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 72,
    "family": "Qwen",
    "license": "Qwen License",
    "tags": [
      "vision",
      "chat",
      "general"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 42,
      "q5": 50,
      "q8": 78,
      "fp16": 144
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "Frontier-class open vision-language model from Alibaba, scoring 70.2 on MMMU and 88.6 on MMBench. Uses the Qwen License rather than Apache, with a 100M MAU clause.",
    "best": [
      "You need frontier visual reasoning and can run a 70B-class model",
      "You're processing complex documents, charts, or diagrams at scale",
      "You need 128K context for long multimodal sessions",
      "Your product is below the 100M MAU threshold of the Qwen License"
    ],
    "_en_extras": {
      "strengths": [
        "Frontier vision benchmarks (MMMU 70.2)",
        "128K context window",
        "Strong OCR and grounding capabilities"
      ],
      "weaknesses": [
        "Qwen License — not Apache, has a 100M MAU clause",
        "40GB+ VRAM in Q4 — multi-GPU for full precision",
        "Tooling support varies vs the 7B variant"
      ],
      "verdict": "The strongest open VLM available — check the MAU clause before betting your product on it."
    }
  },
  {
    "id": "qwen25-omni-7b",
    "name": "Qwen 2.5 Omni 7B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 7,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "audio",
      "chat"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 8,
      "mid": 25,
      "high": 60
    },
    "desc": "Alibaba's first true omni-modal open model — text, image, audio, and video in, with text and speech out. A research-grade preview rather than a production-ready release.",
    "best": [
      "You're researching unified multimodal pipelines and want one model end-to-end",
      "You need speech synthesis alongside text generation in a single model",
      "You're prototyping voice agents that also handle images and video",
      "You're willing to wire up vLLM or transformers directly"
    ],
    "_en_extras": {
      "strengths": [
        "Text, image, audio, and video input in one model",
        "Speech output without a separate TTS",
        "Apache 2.0",
        "Compact 7B footprint"
      ],
      "weaknesses": [
        "No official Ollama tag — community GGUFs only",
        "32K context is short for video-heavy workloads",
        "Early-generation omni model — quality lags specialized stacks"
      ],
      "verdict": "The first credible open omni model — promising for research, but not a drop-in for production yet."
    }
  },
  {
    "id": "qwq-32b",
    "name": "QwQ 32B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 32,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "reasoning"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Alibaba's dedicated 32B reasoner, trained with reinforcement learning rather than distillation. Hits 79.5 on AIME24 and 90.6 on MATH-500 — a direct Apache-licensed alternative to DeepSeek R1.",
    "best": [
      "You need a frontier-class reasoner you can run on a single 48GB GPU",
      "You're solving math, logic, or formal problems where chain-of-thought matters",
      "You want an Apache-licensed alternative to DeepSeek R1",
      "You need 131K context for long reasoning traces"
    ],
    "_en_extras": {
      "strengths": [
        "Direct competitor to DeepSeek R1 at a fraction of the size",
        "131K context for long thinking traces",
        "Trained with RL, not just distilled",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Very verbose — token costs add up fast",
        "Requires YaRN for context beyond 8K",
        "Overkill for non-reasoning chat workloads"
      ],
      "verdict": "The best Apache-licensed reasoner you can run on a single GPU."
    }
  },
  {
    "id": "deepseek-r1-671b",
    "name": "DeepSeek R1 671B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 671,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "reasoning",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 400,
      "q5": 480,
      "q8": 720,
      "fp16": 1342
    },
    "ramCpu": 512,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "The reference open reasoning model — a 671B MoE with 37B active, released under MIT. Scores 97.3 on MATH-500, 79.8 on AIME, and 90.8 on MMLU.",
    "best": [
      "You're running a dedicated inference server and need frontier reasoning",
      "You want the strongest open math, code, and logic model available",
      "You need an MIT-licensed model with no commercial restrictions",
      "You're benchmarking against closed frontier models like o1 or o3"
    ],
    "_en_extras": {
      "strengths": [
        "MIT license — no commercial restrictions",
        "Reference open reasoning model",
        "MATH-500 score of 97.3",
        "R1-0528 update further sharpens reasoning"
      ],
      "weaknesses": [
        "400GB+ in Q4 — server-class hardware required",
        "Out of reach for any single-machine local setup",
        "Very long reasoning traces drive up latency"
      ],
      "verdict": "The open reasoning gold standard — if you have the hardware to host it."
    }
  },
  {
    "id": "deepseek-r1-distill-llama-70b",
    "name": "DeepSeek R1 Distill Llama 70B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 70,
    "family": "DeepSeek",
    "license": "Llama 3.3 Community + DeepSeek",
    "tags": [
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 40,
      "q5": 48,
      "q8": 75,
      "fp16": 140
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "DeepSeek's R1 reasoning behavior distilled into Llama 3.3 70B. Brings frontier-class reasoning down to a single high-end GPU, but inherits both Llama and DeepSeek licenses.",
    "best": [
      "You want R1-style reasoning on a single 80GB GPU or dual 48GB setup",
      "You need 128K context for long chain-of-thought work",
      "You're already deploying Llama 3.3 70B and want a reasoning upgrade",
      "You can comply with both Llama Community and DeepSeek license terms"
    ],
    "_en_extras": {
      "strengths": [
        "Frontier-class reasoning on a single workstation-class GPU",
        "128K context window",
        "Outperforms SFT-only 70B models on hard reasoning",
        "Strong drop-in for existing Llama 70B deployments"
      ],
      "weaknesses": [
        "Dual licensing (Llama 3.3 Community + DeepSeek)",
        "Hugging Face gated access via the Llama base",
        "Trails full R1 671B on the hardest problems"
      ],
      "verdict": "The most practical way to get R1-class reasoning on a single high-end GPU."
    }
  },
  {
    "id": "deepseek-v3-671b",
    "name": "DeepSeek V3 671B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 671,
    "family": "DeepSeek",
    "license": "DeepSeek License",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 400,
      "q5": 480,
      "q8": 720,
      "fp16": 1342
    },
    "ramCpu": 512,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "DeepSeek's frontier-open MoE — 671B total, 37B active — with multi-head latent attention and an auxiliary-loss-free balancing scheme. The V3.1-Terminus update relicenses under MIT.",
    "best": [
      "You're running server-class inference and want frontier-open performance",
      "You need a non-reasoning frontier model for general chat and code at scale",
      "You want the MLA architecture's reduced KV-cache footprint",
      "You can move to V3.1-Terminus for MIT licensing"
    ],
    "_en_extras": {
      "strengths": [
        "Frontier-open performance in chat, code, and general tasks",
        "MLA cuts KV memory significantly vs standard attention",
        "V3.1-Terminus available under MIT",
        "Pretrained on 14.8T tokens"
      ],
      "weaknesses": [
        "Original V3 uses the restrictive DeepSeek License",
        "400GB+ in Q4 — server-class hardware only",
        "Overkill for most workloads under 10B requests/month"
      ],
      "verdict": "Frontier-open performance for teams with serious inference infrastructure — go straight to V3.1-Terminus for the MIT license."
    }
  },
  {
    "id": "gemma3-4b",
    "name": "Gemma 3 4B",
    "author": "Google",
    "origin": "us",
    "params": 4,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "vision",
      "multilingual",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 3,
      "q5": 4,
      "q8": 5,
      "fp16": 9
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 14,
      "mid": 40,
      "high": 100
    },
    "desc": "Google's compact multimodal 4B with 128K context, vision input, and 140+ language coverage. The smallest Gemma 3 with the full feature set intact.",
    "best": [
      "You need vision and long context on a low-VRAM machine or edge device",
      "You're shipping multilingual apps and need broad language coverage",
      "You want one small model for both text and image inputs",
      "You're prototyping before scaling to 12B or 27B"
    ],
    "_en_extras": {
      "strengths": [
        "Multimodal in a 4B footprint",
        "140+ language coverage",
        "128K context",
        "Sliding-window attention keeps memory in check"
      ],
      "weaknesses": [
        "Gemma License — review terms before commercial use",
        "Trails the 12B and 27B on reasoning and code"
      ],
      "verdict": "The most capable 4B multimodal you can run locally — strong default for resource-constrained deployments."
    }
  },
  {
    "id": "gemma3-12b",
    "name": "Gemma 3 12B",
    "author": "Google",
    "origin": "us",
    "params": 12,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "vision",
      "multilingual"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 7,
      "q5": 9,
      "q8": 13,
      "fp16": 24
    },
    "ramCpu": 14,
    "tokSec": {
      "low": 7,
      "mid": 22,
      "high": 60
    },
    "desc": "The 12B sweet spot of Google's Gemma 3 line — multimodal, 128K context, and 140 languages. Fits on a single consumer GPU with room for batching.",
    "best": [
      "You want strong multimodal performance on a 16GB or 24GB GPU",
      "You need long-context summarization or document Q&A with vision",
      "You're shipping a product covering many languages",
      "You want one general-purpose Gemma without going to 27B"
    ],
    "_en_extras": {
      "strengths": [
        "Sweet spot for multimodal performance vs hardware cost",
        "128K context window",
        "140 language coverage",
        "Strong general-purpose default"
      ],
      "weaknesses": [
        "Gemma License rather than Apache",
        "At least 9GB RAM required for Ollama deployment",
        "No dedicated thinking mode"
      ],
      "verdict": "The pragmatic Gemma 3 — most teams should start here before reaching for the 27B."
    }
  },
  {
    "id": "gemma3-27b",
    "name": "Gemma 3 27B",
    "author": "Google",
    "origin": "us",
    "params": 27,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "vision",
      "multilingual"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 16,
      "q5": 19,
      "q8": 29,
      "fp16": 54
    },
    "ramCpu": 28,
    "tokSec": {
      "low": 3,
      "mid": 13,
      "high": 32
    },
    "desc": "Google's flagship Gemma 3 at 27B — multimodal, 128K context, and an LMArena Elo of 1338 that beats Llama 3.1 405B at 15x smaller. Sets the bar for open chat under 30B.",
    "best": [
      "You want the strongest open chat model under 30B",
      "You need multimodal input and long context in one model",
      "You're standardizing on Google's open stack",
      "You have a 24GB+ GPU and want frontier-class chat locally"
    ],
    "_en_extras": {
      "strengths": [
        "LMArena Elo 1338 — beats Llama 3.1 405B at 15x smaller",
        "Multimodal with vision input",
        "128K context window",
        "140 language coverage"
      ],
      "weaknesses": [
        "Gemma License rather than Apache",
        "No thinking mode for hard reasoning",
        "Trails dedicated reasoners on math benchmarks"
      ],
      "verdict": "Punch-above-its-weight open chat that quietly outscores models 15x its size."
    }
  },
  {
    "id": "phi4-multimodal",
    "name": "Phi-4 Multimodal 5.6B",
    "author": "Microsoft",
    "origin": "us",
    "params": 5.6,
    "family": "Phi",
    "license": "MIT",
    "tags": [
      "chat",
      "vision",
      "audio",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 4,
      "q5": 5,
      "q8": 7,
      "fp16": 12
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 15,
      "mid": 45,
      "high": 110
    },
    "desc": "Microsoft's 5.6B multimodal model — text, image, and audio in, text out — using a Mixture-of-LoRAs design. Accepts roughly 2.8 hours of audio per request.",
    "best": [
      "You're processing long audio recordings on a laptop or edge device",
      "You need lightweight multimodal in an English-first context",
      "You want an MIT-licensed multimodal model with no commercial restrictions",
      "You're prototyping voice + vision pipelines without server-class hardware"
    ],
    "_en_extras": {
      "strengths": [
        "Text, image, and audio input in a 5.6B footprint",
        "MIT license",
        "128K context window",
        "Long audio handling (up to ~2.8 hours)"
      ],
      "weaknesses": [
        "No official Ollama tag",
        "English-first — weaker on other languages",
        "Limited ecosystem tooling vs Qwen VL"
      ],
      "verdict": "The lightest credible audio-capable multimodal under MIT — ideal for transcription-adjacent pipelines on small hardware."
    }
  },
  {
    "id": "phi4-reasoning-14b",
    "name": "Phi-4 Reasoning 14B",
    "author": "Microsoft",
    "origin": "us",
    "params": 14,
    "family": "Phi",
    "license": "MIT",
    "tags": [
      "reasoning"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 9,
      "q5": 11,
      "q8": 16,
      "fp16": 28
    },
    "ramCpu": 16,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 55
    },
    "desc": "Microsoft's 14B reasoner that beats R1-Distill-Llama-70B on AIME and GPQA with 5x fewer parameters. MIT-licensed, English-first, with a 32K context.",
    "best": [
      "You want frontier-class reasoning that fits on a 16GB or 24GB GPU",
      "You need MIT licensing for commercial deployment",
      "You're solving math, science, or logic problems in English",
      "You want to replace a 70B reasoner with something far cheaper to run"
    ],
    "_en_extras": {
      "strengths": [
        "Beats R1-Distill-Llama-70B on AIME and GPQA with 5x fewer parameters",
        "MIT license",
        "Increased RoPE base frequency improves long-form reasoning",
        "Practical hardware footprint for a frontier-class reasoner"
      ],
      "weaknesses": [
        "English-first — weak multilingual performance",
        "Weaker on non-Python code generation",
        "32K context vs 128K on most peers"
      ],
      "verdict": "The most efficient open reasoner you can run on a single consumer GPU."
    }
  },
  {
    "id": "command-r-plus-104b",
    "name": "Command R+ 104B (08-2024)",
    "author": "Cohere",
    "origin": "us",
    "params": 104,
    "family": "Command",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 60,
      "q5": 72,
      "q8": 110,
      "fp16": 208
    },
    "ramCpu": 96,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 15
    },
    "desc": "Cohere's 104B RAG and tool-use flagship from August 2024 — 128K context, 23 languages. Licensed CC-BY-NC, so non-commercial only without a Cohere agreement.",
    "best": [
      "You're building research or internal-only RAG systems",
      "You need top-tier tool-use behavior in an open weights model",
      "You need broad multilingual coverage across 23 languages",
      "You're evaluating before signing a commercial agreement with Cohere"
    ],
    "_en_extras": {
      "strengths": [
        "Best-in-class open RAG and tool-use at release",
        "128K context window",
        "23 language coverage",
        "Higher throughput and lower latency than the April 2024 release"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 — no commercial use without a separate license",
        "60GB+ VRAM in Q4",
        "Surpassed by newer 100B-class models on general benchmarks"
      ],
      "verdict": "Strong RAG and tool-use, but the non-commercial license rules it out of most production deployments."
    }
  },
  {
    "id": "aya-expanse-8b",
    "name": "Aya Expanse 8B",
    "author": "Cohere For AI",
    "origin": "us",
    "params": 8,
    "family": "Aya",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "Cohere For AI's multilingual 8B covering 23 languages, outperforming Gemma 2 9B and Llama 3.1 8B in its language set. CC-BY-NC — non-commercial only.",
    "best": [
      "You're doing multilingual research that doesn't require commercial use",
      "You need strong coverage of low-resource languages at the 8B tier",
      "You're benchmarking against Gemma 2 9B and Llama 3.1 8B on non-English tasks",
      "You're building an internal evaluation harness"
    ],
    "_en_extras": {
      "strengths": [
        "23 language coverage with strong low-resource performance",
        "Beats Gemma 2 9B and Llama 3.1 8B on multilingual benchmarks",
        "Particularly strong on low-resource languages",
        "Compact 8B footprint"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 — no commercial deployment",
        "Only 8K context",
        "Outclassed by Qwen 3 8B on most general tasks"
      ],
      "verdict": "A strong multilingual research model held back by its non-commercial license."
    }
  },
  {
    "id": "aya-expanse-32b",
    "name": "Aya Expanse 32B",
    "author": "Cohere For AI",
    "origin": "us",
    "params": 32,
    "family": "Aya",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "The 32B sibling of Aya Expanse from Cohere For AI, delivering a 25% gain on low-resource languages and 89.9% win rate on Dolly vs Mixtral 8x22B. CC-BY-NC.",
    "best": [
      "You're doing high-quality multilingual research at the 30B tier",
      "You need top-tier low-resource language performance",
      "You're comparing against Mixtral 8x22B on multilingual benchmarks",
      "Non-commercial use is acceptable for your project"
    ],
    "_en_extras": {
      "strengths": [
        "25% improvement on low-resource languages vs peers",
        "23-language coverage",
        "89.9% win rate on Dolly vs Mixtral 8x22B",
        "Strong general performance for its size"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 — no commercial use",
        "Only 8K context window",
        "Newer Qwen 3 models close much of the gap with permissive licenses"
      ],
      "verdict": "The strongest open multilingual 32B for research — license disqualifies it for production."
    }
  },
  {
    "id": "eurollm-9b",
    "name": "EuroLLM 9B Instruct",
    "author": "Utter Project / EU",
    "origin": "eu",
    "params": 9,
    "family": "EuroLLM",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "An EU-funded 9B (Horizon Europe) covering 35 languages including all 24 official EU ones. Trained on 4T tokens on the MareNostrum5 supercomputer and released under Apache 2.0.",
    "best": [
      "You're building EU-focused products that need sovereignty optics",
      "You need broad coverage of all 24 official EU languages",
      "You want an Apache 2.0 model from a European institution",
      "You're working on policy or public-sector projects with EU procurement"
    ],
    "_en_extras": {
      "strengths": [
        "EU sovereignty — Horizon Europe funded",
        "Apache 2.0 license",
        "Best European open model at its scale",
        "Strong coverage of all 24 official EU languages"
      ],
      "weaknesses": [
        "Only 4K context — short for modern workloads",
        "No official Ollama tag",
        "Outpaced by Qwen 3 8B on general benchmarks"
      ],
      "verdict": "The natural pick when EU sovereignty or procurement requires a European-trained Apache 2.0 model."
    }
  },
  {
    "id": "teuken-7b",
    "name": "Teuken 7B Instruct",
    "author": "OpenGPT-X",
    "origin": "de",
    "params": 7,
    "family": "Teuken",
    "license": "Apache 2.0 (commercial)",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 14
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "A German government-funded (BMWK) 7B from OpenGPT-X covering all 24 EU languages. The commercial variant is Apache 2.0; the research-v0.4 variant has restricted terms.",
    "best": [
      "You need EU sovereignty credentials with German institutional backing",
      "You're building products across all 24 EU languages",
      "You need an Apache-licensed European 7B for commercial deployment",
      "You're working on public-sector or DACH-region projects"
    ],
    "_en_extras": {
      "strengths": [
        "Built for EU sovereignty",
        "Covers all 24 official EU languages",
        "Apache 2.0 commercial variant available",
        "German government (BMWK) backing"
      ],
      "weaknesses": [
        "Only 4K context",
        "No official Ollama tag",
        "Research-v0.4 variant carries a restricted license — check which you grabbed"
      ],
      "verdict": "An EU sovereignty pick with the commercial variant clearly licensed — confirm which variant you're using."
    }
  },
  {
    "id": "pleias-3b",
    "name": "Pleias 3B Preview",
    "author": "PleIAs",
    "origin": "fr",
    "params": 3,
    "family": "Pleias",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "multilingual",
      "fr",
      "small"
    ],
    "ctx": 2048,
    "vram": {
      "q4": 2,
      "q5": 2.5,
      "q8": 3.5,
      "fp16": 6
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 160
    },
    "desc": "A French 3B from PleIAs trained 100% on Common Corpus open data, designed for EU AI Act compliance by construction. Fully traceable training data, Apache 2.0.",
    "best": [
      "You need EU AI Act compliance with auditable training data",
      "You're shipping into regulated industries that require provenance",
      "You want a small French-capable model under Apache 2.0",
      "You're building tooling where data lineage matters more than raw benchmarks"
    ],
    "_en_extras": {
      "strengths": [
        "EU AI Act compliant by design",
        "100% traceable training data (Common Corpus)",
        "Apache 2.0",
        "Compact 3B footprint"
      ],
      "weaknesses": [
        "Modest benchmark scores vs general-purpose 3B peers",
        "Context window of only ~2K",
        "Narrower capabilities than mainstream open models"
      ],
      "verdict": "The right pick when training-data provenance and EU compliance matter more than benchmark numbers."
    }
  },
  {
    "id": "pleias-rag-1b",
    "name": "Pleias-RAG 1B",
    "author": "PleIAs",
    "origin": "fr",
    "params": 1.2,
    "family": "Pleias",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "fr",
      "small"
    ],
    "ctx": 2048,
    "vram": {
      "q4": 0.8,
      "q5": 1,
      "q8": 1.5,
      "fp16": 2.5
    },
    "ramCpu": 4,
    "tokSec": {
      "low": 50,
      "mid": 150,
      "high": 280
    },
    "desc": "A 1.2B RAG-specialized model from PleIAs with built-in citation and grounding behavior. Beats most sub-4B small language models on HotPotQA.",
    "best": [
      "You're deploying RAG on tight hardware budgets or edge devices",
      "You need clean citations and grounding from a small model",
      "You're handling structured Q&A where source attribution matters",
      "You want a defensible audit trail for regulated RAG deployments"
    ],
    "_en_extras": {
      "strengths": [
        "Built-in citation and grounding in RAG responses",
        "Outperforms most small language models under 4B on HotPotQA",
        "Runs on lightweight hardware",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Context window of only ~2K",
        "No official Ollama tag",
        "Specialized for RAG — not a general chat model"
      ],
      "verdict": "The most efficient small open model for production RAG with citations."
    }
  },
  {
    "id": "moshi-7b",
    "name": "Moshi 7B",
    "author": "Kyutai",
    "origin": "fr",
    "params": 7.6,
    "family": "Moshi",
    "license": "CC-BY 4.0",
    "tags": [
      "audio",
      "fr"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 15
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 8,
      "mid": 25,
      "high": 60
    },
    "desc": "Kyutai's full-duplex speech model — 7.6B parameters with sub-second latency (~200ms) and two voices, Moshiko and Moshika. A speech architecture, not a text LLM.",
    "best": [
      "You're building real-time voice interfaces and need full-duplex behavior",
      "You need low-latency speech-to-speech without separate TTS and STT",
      "You're researching speech architectures rather than text LLMs",
      "You can run inference directly in PyTorch or via Kyutai's stack"
    ],
    "_en_extras": {
      "strengths": [
        "First open full-duplex speech model",
        "Sub-second latency (~200ms in practice)",
        "Mimi codec at 12.5 Hz / 1.1 kbps on 24 kHz audio",
        "From Kyutai, a respected French AI lab"
      ],
      "weaknesses": [
        "Not a text LLM — different use case entirely",
        "Architecture not supported by Ollama",
        "CC-BY 4.0 license — attribution required"
      ],
      "verdict": "The reference open full-duplex speech model — niche, but the only credible choice in its category."
    }
  },
  {
    "id": "helium-1-2b",
    "name": "Helium 1 2B",
    "author": "Kyutai",
    "origin": "fr",
    "params": 2,
    "family": "Helium",
    "license": "CC-BY-SA 4.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr",
      "small"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 1.5,
      "q5": 2,
      "q8": 3,
      "fp16": 5
    },
    "ramCpu": 4,
    "tokSec": {
      "low": 35,
      "mid": 100,
      "high": 200
    },
    "desc": "Kyutai's 2B multilingual base covering all 24 EU languages, distilled from Gemma 2 — which means Gemma Terms apply on top of CC-BY-SA. Beats Qwen 2.5 1.5B, Gemma 2B, and Llama 3.2 3B at its scale.",
    "best": [
      "You need a small multilingual base for fine-tuning across EU languages",
      "You're building edge or embedded deployments with French as a priority",
      "You want a European base model with strong sub-3B performance",
      "You're doing pre-training research and need a clean small foundation"
    ],
    "_en_extras": {
      "strengths": [
        "Compact multilingual base from a European lab",
        "Covers all 24 EU languages",
        "Beats Qwen 2.5 1.5B, Gemma 2B, and Llama 3.2 3B at its scale",
        "Built by Kyutai"
      ],
      "weaknesses": [
        "CC-BY-SA 4.0 plus Gemma Terms via distillation",
        "Base model — not instruction-tuned",
        "No official Ollama support"
      ],
      "verdict": "A strong European small base for fine-tuning — just budget for the dual-license obligations."
    }
  },
  {
    "id": "smollm2-17b",
    "name": "SmolLM2 1.7B Instruct",
    "author": "HuggingFace",
    "origin": "fr",
    "params": 1.7,
    "family": "SmolLM",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "small"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 1.2,
      "q5": 1.5,
      "q8": 2.2,
      "fp16": 3.5
    },
    "ramCpu": 4,
    "tokSec": {
      "low": 40,
      "mid": 120,
      "high": 230
    },
    "desc": "Hugging Face's 1.7B Apache 2.0 instruct model trained on 11T tokens. Beats Qwen2.5-1.5B by roughly 6 points on MMLU-Pro, making it a top pick at the sub-2B tier.",
    "best": [
      "On-device assistants where every megabyte counts",
      "Edge inference on CPUs or low-end GPUs",
      "Building permissively licensed downstream products",
      "Fine-tuning experiments on a single consumer GPU",
      "Latency-critical autocomplete or classification tasks"
    ],
    "_en_extras": {
      "strengths": [
        "Best-in-class quality for its size on MMLU-Pro",
        "Clean Apache 2.0 license with no commercial strings",
        "Massive 11T-token training corpus for a small model",
        "One of the most downloaded small models on Hugging Face"
      ],
      "weaknesses": [
        "English-centric, weak on non-English languages",
        "8K context window is tight for modern RAG workflows",
        "BFCL function-calling score of 27% trails larger peers"
      ],
      "verdict": "If you need an Apache-licensed sub-2B model that punches above its weight, SmolLM2 is the default choice."
    }
  },
  {
    "id": "smolvlm2-22b",
    "name": "SmolVLM2 2.2B Instruct",
    "author": "HuggingFace",
    "origin": "fr",
    "params": 2.2,
    "family": "SmolLM",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat",
      "small"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 1.6,
      "q5": 2,
      "q8": 3,
      "fp16": 4.5
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 30,
      "mid": 90,
      "high": 180
    },
    "desc": "Hugging Face's 2.2B vision-language model built on SmolLM2-1.7B, handling image, video, and text in roughly 5.2GB of VRAM. The smallest serious VLM with video understanding.",
    "best": [
      "Adding vision to mobile or embedded apps",
      "Video frame analysis on a single consumer GPU",
      "Document and screenshot understanding at the edge",
      "Permissively licensed multimodal prototypes",
      "Bandwidth-constrained deployments needing local VLM"
    ],
    "_en_extras": {
      "strengths": [
        "Runs full video inference in ~5.2GB VRAM",
        "Apache 2.0 license suitable for commercial use",
        "Genuine image + video + text capability at 2.2B scale",
        "Inherits SmolLM2's tight text fundamentals"
      ],
      "weaknesses": [
        "8K context inherited from SmolLM2 limits long video",
        "No official Ollama distribution yet",
        "Video understanding is basic compared to frontier VLMs"
      ],
      "verdict": "The go-to small VLM when you need vision plus video in under 3B parameters and an Apache license."
    }
  },
  {
    "id": "glm-51",
    "name": "GLM-5.1",
    "author": "Z.AI",
    "origin": "cn",
    "params": 744,
    "family": "GLM",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual",
      "moe"
    ],
    "ctx": 200000,
    "vram": {
      "q4": 445,
      "q5": 535,
      "q8": 800,
      "fp16": 1488
    },
    "ramCpu": 512,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "Z.AI's flagship MoE with 744B total and 40B active parameters under an MIT license. Ranked #1 open-weight model on Artificial Analysis as of April 2026.",
    "best": [
      "Production agentic systems on dedicated server clusters",
      "Replacing closed frontier APIs with self-hosted weights",
      "Long-context document analysis up to 200K tokens",
      "Open-weight SWE-Bench-grade coding agents",
      "Commercial deployments that need MIT licensing"
    ],
    "_en_extras": {
      "strengths": [
        "#1 open-weight model on Artificial Analysis (April 2026)",
        "58.4 on SWE-Bench Pro, leading all open weights",
        "200K context for whole-repo reasoning",
        "True MIT license with full commercial rights"
      ],
      "weaknesses": [
        "445GB+ in Q4 quantization requires a multi-GPU server",
        "No official Ollama tag at launch",
        "Operational complexity rules out single-workstation use"
      ],
      "verdict": "The strongest open-weight model available today, provided you have the hardware to run a 744B MoE."
    }
  },
  {
    "id": "minimax-m27",
    "name": "MiniMax-M2.7",
    "author": "MiniMax",
    "origin": "cn",
    "params": 229,
    "family": "MiniMax",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe"
    ],
    "ctx": 205000,
    "vram": {
      "q4": 138,
      "q5": 165,
      "q8": 246,
      "fp16": 458
    },
    "ramCpu": 160,
    "tokSec": {
      "low": 2,
      "mid": 10,
      "high": 25
    },
    "desc": "MiniMax's agentic MoE with 229B total and 10B active parameters under Apache 2.0. Open-sourced April 12, 2026 and currently the top trending model on Hugging Face.",
    "best": [
      "Multi-step agentic workflows and tool use",
      "Terminal and shell automation pipelines",
      "SWE-Bench-style autonomous coding agents",
      "Long-context tasks needing 200K+ tokens",
      "Commercial agent products under Apache 2.0"
    ],
    "_en_extras": {
      "strengths": [
        "State-of-the-art open agentic performance",
        "56.22% SWE-Bench Pro and 57% Terminal Bench",
        "Only 10B active parameters keeps inference fast",
        "Clean Apache 2.0 license",
        "#1 trending on Hugging Face at launch"
      ],
      "weaknesses": [
        "138GB+ in Q4 demands serious server hardware",
        "Verbose output in agent mode inflates token costs",
        "Newer release means thinner tooling ecosystem"
      ],
      "verdict": "The most exciting agentic open-weight model of 2026, if your hardware can host 229B parameters."
    }
  },
  {
    "id": "gemma4-31b",
    "name": "Gemma 4 31B",
    "author": "Google",
    "origin": "us",
    "params": 31,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "vision",
      "audio",
      "multilingual"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 18,
      "q5": 22,
      "q8": 33,
      "fp16": 62
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Google's dense 31B multimodal model with native text, image, and audio support across 140+ languages. Ranked #3 on Chatbot Arena's open leaderboard with a 256K context window.",
    "best": [
      "Multilingual production apps spanning 100+ languages",
      "Native audio input and analysis workflows",
      "Long-context document and codebase analysis",
      "On-prem multimodal chat backends",
      "Replacing GPT-4o-class APIs with local weights"
    ],
    "_en_extras": {
      "strengths": [
        "#3 on Chatbot Arena's open leaderboard",
        "Native audio understanding, not just text and images",
        "256K context window in a dense 31B model",
        "Strong coverage across 140+ languages",
        "Backed by Google's training infrastructure"
      ],
      "weaknesses": [
        "Gemma license is more restrictive than Apache 2.0",
        "31B dense model needs ~18GB VRAM in Q4",
        "Audio quality trails purpose-built ASR models"
      ],
      "verdict": "The best open multimodal generalist of the Gemma line, assuming you can live with the Gemma license."
    }
  },
  {
    "id": "gemma4-e4b",
    "name": "Gemma 4 E4B",
    "author": "Google",
    "origin": "us",
    "params": 4,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "vision",
      "audio",
      "multilingual",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 10,
      "q5": 12,
      "q8": 18,
      "fp16": 33
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 14,
      "mid": 40,
      "high": 100
    },
    "desc": "Google's 4B-effective multimodal Gemma variant tuned for laptops and edge devices, handling text, image, and audio across 140 languages with a 128K context.",
    "best": [
      "Multimodal apps running on laptops or mobile",
      "Offline assistants that need image and audio input",
      "Multilingual edge deployments",
      "Low-power on-device inference",
      "Prototyping multimodal flows before scaling up"
    ],
    "_en_extras": {
      "strengths": [
        "Full text + image + audio in a 4B model",
        "Runs comfortably on laptops and high-end phones",
        "128K context is generous for the size class",
        "140-language coverage in a small footprint"
      ],
      "weaknesses": [
        "Gemma license restricts some commercial uses",
        "Quality clearly trails 12B+ multimodal models",
        "Audio reasoning is functional but not robust"
      ],
      "verdict": "The most capable sub-5B multimodal model for edge deployments, with the usual Gemma license caveats."
    }
  },
  {
    "id": "qwen35-9b",
    "name": "Qwen 3.5 9B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 9,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual"
    ],
    "ctx": 262000,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 9,
      "mid": 28,
      "high": 75
    },
    "desc": "Alibaba's next-generation dense 9B model with a 262K native context window and an improved toggleable thinking mode. Apache 2.0 licensed.",
    "best": [
      "Long-document analysis without RAG",
      "Multilingual assistants covering 119 languages",
      "Switching between fast and deep reasoning per request",
      "Single-GPU production deployments",
      "Permissive commercial use cases"
    ],
    "_en_extras": {
      "strengths": [
        "262K native context in a 9B parameter model",
        "Toggleable thinking mode for cost control",
        "Strong multilingual performance across 119 languages",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "Fine-tune ecosystem is still less mature than Qwen 2.5",
        "Thinking mode can be verbose by default"
      ],
      "verdict": "The best long-context Apache-licensed 9B today, especially if you need toggleable reasoning."
    }
  },
  {
    "id": "qwen35-27b",
    "name": "Qwen 3.5 27B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 27,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual"
    ],
    "ctx": 262000,
    "vram": {
      "q4": 16,
      "q5": 19,
      "q8": 29,
      "fp16": 54
    },
    "ramCpu": 28,
    "tokSec": {
      "low": 3,
      "mid": 13,
      "high": 32
    },
    "desc": "Alibaba's dense 27B Qwen 3.5 with a 262K context window and calibrated thinking mode. One of the best quality-to-size trade-offs in the open 25B-30B class.",
    "best": [
      "Math, science, and STEM-heavy reasoning",
      "Long-context analysis at 100K+ tokens",
      "Single high-end GPU production deployments",
      "Multilingual technical workloads",
      "Replacing closed mid-tier APIs"
    ],
    "_en_extras": {
      "strengths": [
        "262K native context window",
        "Well-calibrated thinking mode",
        "Strong math and science reasoning",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "Needs ~16GB VRAM in Q4",
        "Gemma 3 27B is a close competitor",
        "Thinking mode adds latency on simple queries"
      ],
      "verdict": "The best Apache-licensed dense model in the 27B class for long-context reasoning."
    }
  },
  {
    "id": "qwen35-397b-a17b",
    "name": "Qwen 3.5 397B-A17B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 397,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual",
      "moe"
    ],
    "ctx": 262000,
    "vram": {
      "q4": 240,
      "q5": 285,
      "q8": 425,
      "fp16": 794
    },
    "ramCpu": 280,
    "tokSec": {
      "low": 2,
      "mid": 7,
      "high": 20
    },
    "desc": "Alibaba's flagship MoE with 397B total and 17B active parameters, ranked #5 open-weight on Artificial Analysis. Apache 2.0 with a 262K context.",
    "best": [
      "Top-tier open-weight performance on a multi-GPU server",
      "Long-context enterprise workloads",
      "Replacing closed frontier models with self-hosted weights",
      "Commercial deployments needing Apache licensing"
    ],
    "_en_extras": {
      "strengths": [
        "#5 on Artificial Analysis's open leaderboard",
        "262K context window",
        "Only 17B active parameters keeps inference efficient",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "240GB+ in Q4 demands a multi-GPU server",
        "MoE deployment adds operational complexity",
        "Beaten by GLM-5.1 and MiniMax-M2.7 on key benchmarks"
      ],
      "verdict": "A strong flagship MoE with permissive licensing, though no longer the top of the open leaderboard."
    }
  },
  {
    "id": "qwen36-35b-a3b",
    "name": "Qwen 3.6 35B-A3B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 35,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "code",
      "reasoning",
      "moe"
    ],
    "ctx": 262000,
    "vram": {
      "q4": 21,
      "q5": 25,
      "q8": 38,
      "fp16": 70
    },
    "ramCpu": 28,
    "tokSec": {
      "low": 8,
      "mid": 22,
      "high": 60
    },
    "desc": "Alibaba's agentic coding MoE with 35B total and just 3B active parameters, released April 16, 2026. Scores 73.4% on SWE-Bench while running on a single 24GB GPU.",
    "best": [
      "Local SWE-Bench-grade coding agents",
      "Single 24GB GPU coding workstations",
      "Repository-scale refactoring with 262K context",
      "Cost-sensitive autonomous coding pipelines",
      "Commercial code assistants under Apache 2.0"
    ],
    "_en_extras": {
      "strengths": [
        "73.4% SWE-Bench in an MoE that fits on a 24GB GPU",
        "Only 3B active parameters means fast inference",
        "262K context handles whole repos",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "No official Ollama tag yet",
        "Brand-new release with limited production track record",
        "Specialized for coding, weaker as a general chat model"
      ],
      "verdict": "The best local coding agent for a single 24GB GPU as of April 2026."
    }
  },
  {
    "id": "qwen3-coder-next",
    "name": "Qwen3-Coder-Next 80B-A3B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 80,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "code",
      "moe"
    ],
    "ctx": 262000,
    "vram": {
      "q4": 48,
      "q5": 58,
      "q8": 86,
      "fp16": 160
    },
    "ramCpu": 72,
    "tokSec": {
      "low": 5,
      "mid": 18,
      "high": 50
    },
    "desc": "Alibaba's hybrid Gated DeltaNet + Attention MoE with 80B total and 3B active parameters. Purpose-built as a local coding copilot that can run on a 24GB GPU when part of the ~48GB Q4 weights is offloaded to system RAM.",
    "best": [
      "Local Copilot-style code completion",
      "Long-context refactoring up to 262K tokens",
      "IDE plugins running on consumer hardware",
      "Apache-licensed commercial code tooling",
      "Reducing reliance on cloud coding APIs"
    ],
    "_en_extras": {
      "strengths": [
        "Runs as a local copilot on a 24GB GPU with partial offload to system RAM",
        "262K context fits entire codebases",
        "Hybrid architecture keeps memory low",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "Hybrid architecture means partial llama.cpp support",
        "Less mature than dense coder alternatives",
        "Tooling lags behind standard transformer models"
      ],
      "verdict": "Choose this when you want a local Copilot replacement and can tolerate early-stage tooling friction."
    }
  },
  {
    "id": "mistral-small-4",
    "name": "Mistral Small 4",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 119,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "code",
      "vision",
      "reasoning",
      "multilingual",
      "fr",
      "moe"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 72,
      "q5": 86,
      "q8": 128,
      "fp16": 238
    },
    "ramCpu": 96,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Mistral AI's 2026 flagship MoE with 119B total and 6.5B active parameters, unifying chat, reasoning, vision, and code in a single Apache 2.0 model.",
    "best": [
      "Consolidating multiple Mistral deployments into one model",
      "Vision plus reasoning workloads on a prosumer rig",
      "Long-context analysis up to 256K tokens",
      "European-data-sovereignty deployments",
      "Apache-licensed commercial products"
    ],
    "_en_extras": {
      "strengths": [
        "Unifies chat, reasoning, vision, and code in one model",
        "Only 6.5B active parameters for fast inference",
        "256K context window",
        "Apache 2.0 license",
        "European lab with strong French and EU-language support"
      ],
      "weaknesses": [
        "72GB+ in Q4 requires a prosumer multi-GPU setup",
        "Breaks continuity with the Small 3.x line",
        "Newer release means thinner ecosystem"
      ],
      "verdict": "Mistral's most ambitious open release yet, ideal if you want one model covering four product lines."
    }
  },
  {
    "id": "devstral-small-2",
    "name": "Devstral Small 2 24B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 24,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "code",
      "fr"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 14,
      "q5": 17,
      "q8": 26,
      "fp16": 48
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 4,
      "mid": 15,
      "high": 40
    },
    "desc": "Mistral AI's 24B coding specialist co-developed with All Hands AI, scoring 72.2% on SWE-Bench under Apache 2.0. Fits on a single RTX 4090.",
    "best": [
      "Single-GPU coding agents on a 4090",
      "Repository-scale refactoring up to 256K tokens",
      "SWE-Bench-style autonomous coding tasks",
      "Apache-licensed commercial code tools",
      "European-lab-sourced coding infrastructure"
    ],
    "_en_extras": {
      "strengths": [
        "72.2% SWE-Bench in a 24B dense model",
        "Runs comfortably on a single RTX 4090",
        "256K context for whole-repo work",
        "Apache 2.0 license",
        "Co-developed with All Hands AI for agent workloads"
      ],
      "weaknesses": [
        "No vision capability",
        "Specialized for code, weaker as a general assistant"
      ],
      "verdict": "The strongest Apache-licensed dense coder that fits on a single consumer GPU."
    }
  },
  {
    "id": "voxtral-4b-tts",
    "name": "Voxtral-4B-TTS",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 4,
    "family": "Mistral",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "audio",
      "multilingual",
      "fr",
      "small"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 10,
      "q5": 12,
      "q8": 18,
      "fp16": 33
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 14,
      "mid": 40,
      "high": 100
    },
    "desc": "Mistral AI's open frontier TTS model covering 9 languages including French, rivaling ElevenLabs on quality. Note: CC-BY-NC 4.0, non-commercial only.",
    "best": [
      "Research and academic TTS projects",
      "Internal demos and prototypes",
      "Personal creative work and audiobooks",
      "Multilingual voice generation with French support",
      "Offline TTS on a laptop"
    ],
    "_en_extras": {
      "strengths": [
        "Studio-quality TTS in an open model",
        "Native French alongside 8 other languages",
        "Runs on consumer laptop hardware",
        "Competitive with ElevenLabs on quality"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 license blocks commercial use",
        "Not a text LLM, narrower utility",
        "Short 4K context limits long-form scripts"
      ],
      "verdict": "An ElevenLabs-class TTS for non-commercial work; commercial users need a different license path."
    }
  },
  {
    "id": "deepseek-r2-32b",
    "name": "DeepSeek R2 32B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 32,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "DeepSeek's dense 32B reasoning model under MIT, scoring 92.7% on AIME. Fits on a single RTX 4090 in Q4 and is the best consumer-GPU reasoner available.",
    "best": [
      "Math, competition, and STEM reasoning",
      "Single-GPU production reasoning workloads",
      "Chain-of-thought research on consumer hardware",
      "Commercial deployments under MIT",
      "Replacing closed reasoning APIs on a 4090"
    ],
    "_en_extras": {
      "strengths": [
        "92.7% on AIME, frontier-level math reasoning",
        "Runs on a single RTX 4090 in Q4",
        "MIT license with full commercial rights",
        "Best consumer-GPU reasoner of its generation"
      ],
      "weaknesses": [
        "Verbose chain-of-thought inflates token costs",
        "Specialized for reasoning, less polished for chat",
        "Latency can spike on hard problems"
      ],
      "verdict": "The best open reasoning model that fits on a single consumer GPU."
    }
  },
  {
    "id": "deepseek-v32",
    "name": "DeepSeek V3.2",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 685,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 410,
      "q5": 490,
      "q8": 735,
      "fp16": 1370
    },
    "ramCpu": 512,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "DeepSeek's 685B MoE featuring DeepSeek Sparse Attention for lower memory use. Holds an IMO gold-medal score and ranks #2 by volume on OpenRouter.",
    "best": [
      "Frontier-class generalist tasks on a multi-GPU server",
      "Competition-level math and reasoning",
      "Replacing closed APIs with MIT-licensed weights",
      "High-volume production inference",
      "Long-context enterprise workloads"
    ],
    "_en_extras": {
      "strengths": [
        "IMO gold-medal reasoning quality",
        "DeepSeek Sparse Attention reduces memory pressure",
        "MIT license",
        "#2 by usage volume on OpenRouter"
      ],
      "weaknesses": [
        "410GB+ in Q4 needs a serious multi-GPU server",
        "Sparse attention adds inference engine complexity",
        "Operational overhead is significant"
      ],
      "verdict": "A frontier-grade MIT-licensed MoE if you can run a multi-GPU cluster."
    }
  },
  {
    "id": "kimi-k25",
    "name": "Kimi K2.5",
    "author": "Moonshot AI",
    "origin": "cn",
    "params": 1000,
    "family": "Kimi",
    "license": "Modified MIT",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 600,
      "q5": 720,
      "q8": 1080,
      "fp16": 2000
    },
    "ramCpu": 700,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 12
    },
    "desc": "Moonshot AI's 1-trillion-parameter MoE with 32B active parameters and a multimodal agent-swarm mode. Around 595GB on disk, aimed at serious home labs and small clusters.",
    "best": [
      "Multi-agent orchestration with swarm-mode coordination",
      "Frontier-scale local inference on a home lab cluster",
      "Long-context multimodal workflows up to 256K tokens",
      "Research into trillion-parameter models",
      "Replacing closed APIs at the high end"
    ],
    "_en_extras": {
      "strengths": [
        "Genuine 1-trillion-parameter open-weight model",
        "Built-in agent swarm coordination mode",
        "256K context with multimodal input",
        "Only 32B active parameters per token"
      ],
      "weaknesses": [
        "~600GB in Q4 demands a small cluster",
        "Modified MIT license needs legal review for commercial use",
        "Operational complexity is extreme",
        "Power and cooling budget rules out most home setups"
      ],
      "verdict": "The largest practical open-weight model in 2026, for teams that can host it."
    }
  },
  {
    "id": "nemotron-3-super-120b",
    "name": "Nemotron 3 Super 120B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 120,
    "family": "Nemotron",
    "license": "NVIDIA Open Model License",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 72,
      "q5": 86,
      "q8": 132,
      "fp16": 240
    },
    "ramCpu": 100,
    "tokSec": {
      "low": 2,
      "mid": 10,
      "high": 25
    },
    "desc": "NVIDIA's first frontier-class release, a 120B MoE with 12B active parameters scoring 60% on SWE-Bench Verified. Ships with the 10T-token training corpus.",
    "best": [
      "Enterprise deployments needing NVIDIA's commercial license",
      "SWE-Bench-grade coding agents on a multi-GPU rig",
      "Long-context analysis up to 128K tokens",
      "Reproducible research using the released training data",
      "Replacing closed APIs with NVIDIA-backed weights"
    ],
    "_en_extras": {
      "strengths": [
        "NVIDIA's first true frontier open release",
        "60% on SWE-Bench Verified",
        "Commercially permissive NVIDIA Open Model License",
        "10T-token training corpus released alongside weights"
      ],
      "weaknesses": [
        "72GB+ in Q4 needs serious hardware",
        "Ollama support is still partial",
        "License is permissive but not Apache 2.0"
      ],
      "verdict": "A credible NVIDIA-backed frontier model with the rare bonus of a public training corpus."
    }
  },
  {
    "id": "olmo3-7b",
    "name": "OLMo 3 7B",
    "author": "Allen AI",
    "origin": "us",
    "params": 7,
    "family": "OLMo",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "Allen AI's fully open 7B model releasing weights, training data, and code under Apache 2.0. The reference choice for reproducible LLM research.",
    "best": [
      "Academic and reproducibility-focused research",
      "Auditing training data for compliance or bias",
      "Teaching LLM internals end-to-end",
      "Apache-licensed commercial baselines",
      "Regulatory environments demanding full traceability"
    ],
    "_en_extras": {
      "strengths": [
        "Weights, data, and code all Apache 2.0",
        "Full traceability from corpus to checkpoint",
        "Backed by Allen AI's research credibility"
      ],
      "weaknesses": [
        "Quality trails the best closed-data 7B models",
        "8K context is restrictive for modern workloads",
        "Not tuned for top leaderboard scores"
      ],
      "verdict": "The clearest choice when full training transparency matters more than peak benchmark scores."
    }
  },
  {
    "id": "olmo3-32b",
    "name": "OLMo 3 32B",
    "author": "Allen AI",
    "origin": "us",
    "params": 32,
    "family": "OLMo",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning"
    ],
    "ctx": 65536,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Allen AI's fully open dense 32B with Think and Instruct variants, releasing weights, data, and code under Apache 2.0. The transparency benchmark for 32B-class models.",
    "best": [
      "Regulated industries that must audit training data",
      "Academic and reproducibility research at scale",
      "EU AI Act compliance requiring full traceability",
      "Apache-licensed commercial deployments",
      "Workloads that call for picking between the Think and Instruct variants"
    ],
    "_en_extras": {
      "strengths": [
        "Complete training transparency at 32B scale",
        "Apache 2.0 across weights, data, and code",
        "Think and Instruct variants for different workloads",
        "Strongest auditable model for AI Act compliance"
      ],
      "weaknesses": [
        "Benchmarks trail closed-data 32B models",
        "64K context lags top competitors",
        "Less polished than commercial-tuned alternatives"
      ],
      "verdict": "The most transparent 32B available; pick it when auditability outweighs raw benchmark scores."
    }
  },
  {
    "id": "tiny-aya-3b",
    "name": "Tiny Aya 3.35B",
    "author": "Cohere For AI",
    "origin": "ca",
    "params": 3.35,
    "family": "Aya",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "chat",
      "multilingual",
      "small"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 2.2,
      "q5": 2.7,
      "q8": 3.8,
      "fp16": 7
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 75,
      "high": 170
    },
    "desc": "Cohere For AI's 3.35B model in 5 regional variants covering 70+ languages, with the Water variant tuned for Europe and APAC. CC-BY-NC 4.0, non-commercial only.",
    "best": [
      "Research on multilingual small models",
      "Internal multilingual tooling under non-commercial use",
      "Region-specific assistants via specialized variants",
      "Educational and academic projects",
      "Personal multilingual chat applications"
    ],
    "_en_extras": {
      "strengths": [
        "Best multilingual quality in the tiny tier",
        "Five regional variants targeting specific markets",
        "Native coverage of 70+ languages",
        "Backed by Cohere For AI research"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 blocks commercial deployment",
        "8K context is limiting for long-form work",
        "Quality varies across regional variants"
      ],
      "verdict": "The strongest tiny multilingual model when commercial use is off the table."
    }
  },
  {
    "id": "granite4-3b-vision",
    "name": "Granite 4.0 3B Vision",
    "author": "IBM",
    "origin": "us",
    "params": 3,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat",
      "small"
    ],
    "ctx": 16384,
    "vram": {
      "q4": 2.2,
      "q5": 2.7,
      "q8": 3.8,
      "fp16": 6.5
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 75,
      "high": 170
    },
    "desc": "IBM's 3B vision-language model purpose-built for enterprise document extraction, including OCR, table parsing, and form understanding. Apache 2.0 and laptop-deployable.",
    "best": [
      "Enterprise document and form extraction pipelines",
      "OCR replacement for invoices, receipts, and PDFs",
      "Table structure understanding at scale",
      "Apache-licensed on-prem document AI",
      "Edge deployment for sensitive enterprise data"
    ],
    "_en_extras": {
      "strengths": [
        "Fast, accurate enterprise OCR",
        "Strong table and form-field extraction",
        "Apache 2.0 license",
        "Runs comfortably on a laptop"
      ],
      "weaknesses": [
        "16K context limits multi-page documents",
        "English-first, weak on non-Latin scripts",
        "Narrow scope, not a general-purpose VLM"
      ],
      "verdict": "The best small VLM for enterprise document workflows under an Apache license."
    }
  },
  {
    "id": "step-35-flash",
    "name": "Step 3.5 Flash",
    "author": "StepFun",
    "origin": "cn",
    "params": 196,
    "family": "Step",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 118,
      "q5": 141,
      "q8": 210,
      "fp16": 392
    },
    "ramCpu": 140,
    "tokSec": {
      "low": 2,
      "mid": 10,
      "high": 25
    },
    "desc": "StepFun's 196B MoE with 11B active parameters delivers 100 tokens/sec at 128K context. Ranks #3 by free-tier volume on OpenRouter under Apache 2.0.",
    "best": [
      "High-throughput chat backends",
      "Long-context workloads needing fast inference",
      "Apache-licensed commercial deployments",
      "Cost-sensitive production at scale",
      "Workloads where latency matters more than top quality"
    ],
    "_en_extras": {
      "strengths": [
        "100 tokens/sec sustained at 128K context",
        "256K maximum context window",
        "Only 11B active parameters",
        "Apache 2.0 license"
      ],
      "weaknesses": [
        "118GB+ in Q4 needs a multi-GPU server",
        "Brand awareness still low outside Asia",
        "Trails top open models on hardest benchmarks"
      ],
      "verdict": "A fast, permissively licensed MoE that punches well above its name recognition."
    }
  },
  {
    "id": "falcon-h1r-7b",
    "name": "Falcon H1R 7B",
    "author": "TII",
    "origin": "ae",
    "params": 7,
    "family": "Falcon",
    "license": "TII Falcon-LLM License 2.0",
    "tags": [
      "reasoning"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "TII's 7B hybrid reasoning architecture that outperforms models seven times its size on key benchmarks. Compact and energy-efficient.",
    "best": [
      "Reasoning workloads on constrained hardware",
      "Energy-sensitive deployments",
      "Research on hybrid reasoning architectures",
      "Edge inference where larger reasoners won't fit",
      "Cost-optimized reasoning APIs"
    ],
    "_en_extras": {
      "strengths": [
        "Outperforms models 7x its size on reasoning",
        "Compact 7B footprint",
        "Strong energy efficiency",
        "Novel hybrid architecture"
      ],
      "weaknesses": [
        "TII Falcon-LLM License 2.0 needs clause-by-clause review",
        "32K context is modest for 2026",
        "Hybrid architecture means uneven tooling support"
      ],
      "verdict": "An impressive small reasoner if its specific license terms fit your use case."
    }
  },
  {
    "id": "mixtral-8x22b",
    "name": "Mixtral 8x22B Instruct",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 141,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe",
      "multilingual",
      "fr"
    ],
    "ctx": 64000,
    "vram": {
      "q4": 82,
      "q5": 100,
      "q8": 150,
      "fp16": 282
    },
    "ramCpu": 120,
    "tokSec": {
      "low": 2,
      "mid": 8,
      "high": 22
    },
    "desc": "Mistral AI's mature 141B/39B-active MoE under Apache 2.0, scoring 77.8 on MMLU and 45.1 on HumanEval. A proven general-purpose workhorse at roughly 80GB in Q4.",
    "best": [
      "Stable, well-understood production deployments",
      "Apache-licensed commercial products",
      "Multilingual general chat including French",
      "Workloads where reliability beats latest benchmarks",
      "Teams with existing Mixtral infrastructure"
    ],
    "_en_extras": {
      "strengths": [
        "Battle-tested mature MoE",
        "Strong general-purpose performance",
        "Apache 2.0 license",
        "Solid multilingual coverage"
      ],
      "weaknesses": [
        "80GB in Q4 still demands serious hardware",
        "Coding trails newer specialists",
        "64K context lags 2026 competitors",
        "Outclassed by newer Mistral releases on most benchmarks"
      ],
      "verdict": "Still a dependable Apache-licensed generalist, but newer Mistral models now beat it across the board."
    }
  },
  {
    "id": "mistral-small-32-24b",
    "name": "Mistral Small 3.2 24B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 24,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "vision",
      "multilingual",
      "fr"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 14,
      "q5": 17,
      "q8": 26,
      "fp16": 48
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 4,
      "mid": 15,
      "high": 40
    },
    "desc": "Mistral AI's June 2025 refresh of Small 3.1: a 24B Apache 2.0 dense model with vision input, sharper function calling, and roughly half the rate of runaway generations seen in 3.1.",
    "best": [
      "Self-hosted multilingual chat assistant on a single 24GB GPU",
      "Agentic workflows that need reliable tool calls without paying for a frontier model",
      "OCR and document Q&A pipelines combining text and screenshots",
      "European deployments needing strong French, German, and Spanish coverage",
      "Drop-in upgrade for existing Small 3.1 deployments"
    ],
    "_en_extras": {
      "strengths": [
        "Roughly 50% fewer infinite-generation loops than 3.1",
        "Notably improved function calling and structured output reliability",
        "Vision encoder included for multimodal tasks",
        "Apache 2.0 — unrestricted commercial use",
        "Fits comfortably on a single 24GB consumer GPU at Q4"
      ],
      "weaknesses": [
        "Requires a recent Ollama build for full chat-template support",
        "Still trails frontier models on hard reasoning benchmarks",
        "Vision quality lags dedicated VLMs like Qwen2.5-VL"
      ],
      "verdict": "The pragmatic choice for self-hosted multilingual chat and tool-using agents on a single GPU — and a no-brainer upgrade from Small 3.1."
    }
  },
  {
    "id": "codestral-22b",
    "name": "Codestral 22B v0.1",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 22,
    "family": "Mistral",
    "license": "Mistral Non-Production License",
    "tags": [
      "code",
      "fr"
    ],
    "ctx": 32000,
    "vram": {
      "q4": 13,
      "q5": 16,
      "q8": 24,
      "fp16": 44
    },
    "ramCpu": 22,
    "tokSec": {
      "low": 4,
      "mid": 16,
      "high": 42
    },
    "desc": "Mistral AI's 22B code specialist covering 80+ programming languages, with strong HumanEval and MBPP scores. Locked behind the restrictive MNPL license — personal and research use only.",
    "best": [
      "Personal coding assistant on a workstation with 16–24GB VRAM",
      "Academic research on code generation and completion",
      "Internal experimentation before committing to a commercial license",
      "Polyglot codebases where coverage across 80+ languages matters"
    ],
    "_en_extras": {
      "strengths": [
        "HumanEval 81.1 and MBPP 78.2 — competitive with much larger models at release",
        "Broad language coverage including niche languages",
        "32k context handles most repo files comfortably",
        "Strong fill-in-the-middle completion"
      ],
      "weaknesses": [
        "MNPL license blocks all production and commercial use",
        "Outclassed by permissively licensed alternatives such as Qwen 2.5 Coder 14B",
        "32k context is tight for large-repo agents"
      ],
      "verdict": "Capable code model held back by its non-production license — for anything you'd ship, pick Qwen 2.5 Coder 14B instead."
    }
  },
  {
    "id": "codestral-mamba-7b",
    "name": "Codestral Mamba 7B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 7,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "code",
      "fr"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Mistral AI's pure Mamba SSM architecture for code, with linear-time inference and a 256k context window. Apache 2.0, but tooling support is still patchy.",
    "best": [
      "Long-context code analysis across entire repositories",
      "Research into state-space models for code",
      "Inference workloads where constant memory matters more than raw quality",
      "Settings where mistral-inference or vLLM is already in the stack"
    ],
    "_en_extras": {
      "strengths": [
        "Verified 256k context for whole-repo reasoning",
        "Constant memory footprint regardless of sequence length",
        "Apache 2.0 license",
        "Linear-time inference scales gracefully on long inputs"
      ],
      "weaknesses": [
        "No official Ollama support",
        "Only partial llama.cpp integration",
        "Requires mistral-inference or vLLM for full functionality",
        "Quality trails transformer-based coders of similar size"
      ],
      "verdict": "The first serious Mamba code model — pick it for long-context experiments, not for daily completion work."
    }
  },
  {
    "id": "magistral-small-24b",
    "name": "Magistral Small 24B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 24,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "reasoning",
      "fr"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 14,
      "q5": 17,
      "q8": 26,
      "fp16": 48
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 4,
      "mid": 15,
      "high": 40
    },
    "desc": "Mistral AI's first open reasoning model, built on Small 3.1 with RL-trained chain-of-thought. Hits 70.7% on AIME24 under Apache 2.0.",
    "best": [
      "Math, science, and competition-style problem solving on local hardware",
      "Transparent reasoning where visible CoT helps debugging",
      "Reasoning workloads requiring a permissively licensed alternative to DeepSeek R1",
      "Multi-step planning agents with reasoning budgets under 40k tokens"
    ],
    "_en_extras": {
      "strengths": [
        "First open Mistral reasoner with a real RL training pipeline",
        "AIME24 70.7% — competitive with much larger reasoners",
        "Apache 2.0 license",
        "Runs on a single 24GB GPU at Q4"
      ],
      "weaknesses": [
        "Highly verbose in thinking mode — token costs add up",
        "Recommended effective context capped around 40k",
        "Trails DeepSeek R1 distills on hardest math benchmarks"
      ],
      "verdict": "Mistral's first credible reasoning model — solid math chops under Apache 2.0, if you can stomach the verbose CoT."
    }
  },
  {
    "id": "mistral-large-3",
    "name": "Mistral Large 3 675B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 675,
    "family": "Mistral",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "vision",
      "multilingual",
      "fr",
      "moe"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 405,
      "q5": 485,
      "q8": 720,
      "fp16": 1350
    },
    "ramCpu": 480,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "Mistral AI's flagship 675B MoE (41B active) with a 2.5B vision encoder, trained from scratch on 3,000 H200s and released under Apache 2.0. Currently #2 OSS non-reasoning model on LMArena.",
    "best": [
      "Frontier-tier on-prem deployments needing permissive licensing",
      "Multimodal applications requiring top-tier text quality",
      "Sovereign or regulated environments that cannot ship data to closed APIs",
      "Multilingual production workloads across European languages",
      "Replacing GPT-4-class APIs in self-hosted stacks"
    ],
    "_en_extras": {
      "strengths": [
        "Top-tier open weights — #2 OSS non-reasoning on LMArena",
        "Apache 2.0 — fully unrestricted commercial use",
        "Native multimodal with 2.5B vision encoder",
        "256k context window",
        "Strong multilingual coverage out of the box"
      ],
      "weaknesses": [
        "405GB at Q4 — needs an H200 or B200 server-class deployment",
        "Active parameter count (41B) still demands substantial inference compute",
        "Overkill for most single-GPU or developer-laptop use cases"
      ],
      "verdict": "Among the most capable open-weight non-reasoning models shipping today (#2 on LMArena) — if you have the H200s, this replaces closed frontier APIs."
    }
  },
  {
    "id": "mistral-medium-35",
    "name": "Mistral Medium 3.5 128B",
    "author": "Mistral AI",
    "origin": "fr",
    "params": 128,
    "family": "Mistral",
    "license": "Modified MIT",
    "tags": [
      "chat",
      "general",
      "code",
      "reasoning",
      "vision",
      "multilingual",
      "fr"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 74,
      "q5": 91,
      "q8": 137,
      "fp16": 256
    },
    "ramCpu": 160,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 12
    },
    "desc": "Mistral AI's first merged flagship — a dense 128B with vision, 256k context, and configurable reasoning. Hits 77.6% on SWE-Bench Verified, consolidating Medium 3.1, Magistral, and Devstral 2 into one model.",
    "best": [
      "Agentic coding workflows demanding state-of-the-art SWE-Bench performance",
      "Customer-support automation needing top τ³-Telecom scores",
      "Long-document analysis up to 256k tokens",
      "Multilingual vision tasks across 24 languages",
      "Production deployments wanting reasoning toggleable per request"
    ],
    "_en_extras": {
      "strengths": [
        "SWE-Bench Verified 77.6% — best-in-class for open weights",
        "τ³-Telecom 91.4% for tool-using agents",
        "256k context with strong long-context retention",
        "Vision-enabled and multilingual across 24 languages",
        "Modified MIT — permissive for most commercial use"
      ],
      "weaknesses": [
        "~74GB at Q4 — needs a 4-GPU box for comfortable serving",
        "Revenue clause kicks in for large enterprises",
        "Single-model consolidation means no separate specialized variants"
      ],
      "verdict": "The first Mistral flagship that bundles coding, reasoning, and vision into one model — and it's competitive on every axis."
    }
  },
  {
    "id": "llama-3-1-405b",
    "name": "Llama 3.1 405B Instruct",
    "author": "Meta",
    "origin": "us",
    "params": 405,
    "family": "Llama",
    "license": "Llama 3.1 Community",
    "tags": [
      "chat",
      "general",
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 240,
      "q5": 288,
      "q8": 435,
      "fp16": 810
    },
    "ramCpu": 320,
    "tokSec": {
      "low": 0.5,
      "mid": 2,
      "high": 8
    },
    "desc": "Meta's reference open dense model at 405B parameters, with MMLU 88.6 and HumanEval 89.0. Gated on Hugging Face and over 240GB even at Q4.",
    "best": [
      "Self-hosted alternative to closed frontier APIs when you have the hardware",
      "Reproducible research baseline for large dense models",
      "Long-running batch inference where weight licensing matters more than speed",
      "Distillation source for smaller specialist models"
    ],
    "_en_extras": {
      "strengths": [
        "The reference dense open model — widely benchmarked and well-understood",
        "MMLU 88.6, HumanEval 89.0",
        "128k context",
        "Mature ecosystem support across all serving frameworks"
      ],
      "weaknesses": [
        "240GB+ at Q4 — needs a serious multi-GPU server",
        "Hugging Face gated access",
        "Llama 3.1 Community License with MAU clause",
        "Largely superseded by MoE alternatives at similar quality"
      ],
      "verdict": "Still the canonical dense open model, but MoE alternatives now deliver comparable quality at a fraction of the inference cost."
    }
  },
  {
    "id": "llama-4-scout",
    "name": "Llama 4 Scout 109B",
    "author": "Meta",
    "origin": "us",
    "params": 109,
    "family": "Llama",
    "license": "Llama 4 Community",
    "tags": [
      "chat",
      "general",
      "vision",
      "moe",
      "multilingual"
    ],
    "ctx": 10000000,
    "vram": {
      "q4": 65,
      "q5": 78,
      "q8": 117,
      "fp16": 218
    },
    "ramCpu": 100,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Meta's compact Llama 4 MoE — 109B total, 17B active, natively multimodal, with an unprecedented 10M token context. Fits on a single H100.",
    "best": [
      "Whole-codebase or whole-corpus analysis up to 10M tokens",
      "Multimodal pipelines where one H100 is the inference budget",
      "Long-form document understanding without RAG",
      "Multilingual chat with native image input"
    ],
    "_en_extras": {
      "strengths": [
        "10M token context — unmatched among open models",
        "Runs on a single H100 thanks to MoE sparsity",
        "Native multimodal input — no separate vision adapter needed",
        "17B active parameters keeps inference fast"
      ],
      "weaknesses": [
        "Hugging Face gated access",
        "Llama 4 Community License with the >700M MAU clause",
        "Long-context quality drops well before the 10M ceiling",
        "Newer than Llama 3.1 — tooling still catching up"
      ],
      "verdict": "The long-context champion of open weights — if you actually need 10M tokens, nothing else comes close on a single H100."
    }
  },
  {
    "id": "llama-4-maverick",
    "name": "Llama 4 Maverick 400B",
    "author": "Meta",
    "origin": "us",
    "params": 400,
    "family": "Llama",
    "license": "Llama 4 Community",
    "tags": [
      "chat",
      "general",
      "vision",
      "moe",
      "multilingual"
    ],
    "ctx": 1000000,
    "vram": {
      "q4": 240,
      "q5": 285,
      "q8": 425,
      "fp16": 800
    },
    "ramCpu": 280,
    "tokSec": {
      "low": 2,
      "mid": 8,
      "high": 22
    },
    "desc": "Meta's larger Llama 4 MoE at 400B total with 17B active across 128 experts, natively multimodal. LMArena 1417 and 1M token context, but 245GB to download.",
    "best": [
      "Frontier-quality open chat in multi-GPU production",
      "Multimodal agents needing 1M context",
      "Drop-in for teams ready to commit to the Llama 4 ecosystem",
      "Workloads where MMLU-Pro 80 quality justifies the storage cost"
    ],
    "_en_extras": {
      "strengths": [
        "LMArena 1417 — top-tier open chat quality",
        "MMLU-Pro 80",
        "1M token context",
        "Native multimodal with strong vision performance",
        "17B active keeps inference cost manageable"
      ],
      "weaknesses": [
        "245GB download — non-trivial storage and bandwidth",
        "Hugging Face gated access",
        "Llama 4 Community License with >700M MAU clause",
        "Outclassed on reasoning by R1-class models"
      ],
      "verdict": "Meta's largest Llama 4 model and a credible GPT-4-class alternative — if you can host 245GB and accept the MAU clause."
    }
  },
  {
    "id": "llama31-nemotron-70b",
    "name": "Llama 3.1 Nemotron 70B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 70,
    "family": "Nemotron",
    "license": "Llama 3.1 Community",
    "tags": [
      "chat",
      "general",
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 40,
      "q5": 48,
      "q8": 75,
      "fp16": 140
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "NVIDIA's RLHF tune of Llama 3.1 70B, which topped Arena Hard with a score of 85.0 at release. Strong alignment and instruction-following on familiar Llama foundations.",
    "best": [
      "Instruction-heavy chat assistants needing strong alignment",
      "Deployments already standardized on the Llama 3.1 family",
      "Workloads where human-preference alignment beats raw benchmarks",
      "NVIDIA-stack deployments leveraging NIM and TensorRT-LLM"
    ],
    "_en_extras": {
      "strengths": [
        "Arena Hard 85.0 — topped the leaderboard at release",
        "AlpacaEval 2 LC 57.6",
        "MT-Bench 8.98",
        "Strong RLHF on real human preference data"
      ],
      "weaknesses": [
        "Llama 3.1 Community License with MAU clause",
        "Hugging Face gated access",
        "Now overtaken on reasoning by Qwen 2.5 72B and R1 distills",
        "~40GB at Q4 — needs dual 24GB GPUs"
      ],
      "verdict": "An excellent RLHF tune of Llama 3.1 70B — still strong for alignment-heavy chat, though reasoning specialists have since pulled ahead."
    }
  },
  {
    "id": "qwen25-3b",
    "name": "Qwen 2.5 3B Instruct",
    "author": "Alibaba",
    "origin": "cn",
    "params": 3,
    "family": "Qwen",
    "license": "Qwen Research License",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "small"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 2,
      "q5": 2.5,
      "q8": 4,
      "fp16": 6
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 160
    },
    "desc": "Alibaba's compact 3B Qwen 2.5 instruct model with surprisingly strong MMLU 65.6 and HumanEval 74.4. Locked to non-commercial use under the Qwen Research License.",
    "best": [
      "Edge and on-device inference where 2GB VRAM is the budget",
      "Multilingual prototypes before scaling up to a larger model",
      "Research and personal projects under the Qwen Research License",
      "Latency-critical paths where larger models are too slow"
    ],
    "_en_extras": {
      "strengths": [
        "Around 2GB VRAM at Q4 — runs on almost anything",
        "Multilingual coverage rare at this size",
        "MMLU 65.6 and HumanEval 74.4 punch above its weight",
        "32k context out of the box"
      ],
      "weaknesses": [
        "Qwen Research License blocks commercial use",
        "Quality gap vs 7B-and-up is meaningful for non-trivial tasks",
        "32k context limits long-document work"
      ],
      "verdict": "A strong 3B for research and edge prototyping, but the Qwen Research License rules it out of production."
    }
  },
  {
    "id": "qwen25-14b",
    "name": "Qwen 2.5 14B Instruct",
    "author": "Alibaba",
    "origin": "cn",
    "params": 14,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 9,
      "q5": 11,
      "q8": 16,
      "fp16": 28
    },
    "ramCpu": 16,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 55
    },
    "desc": "Alibaba's Apache 2.0 dense 14B hitting MMLU 79.7 and HumanEval 83.5 across 29+ languages. The pragmatic sweet spot for self-hosted general-purpose chat.",
    "best": [
      "General-purpose chat on a single 16–24GB GPU",
      "Multilingual production workloads needing a permissive license",
      "RAG pipelines balancing quality and inference cost",
      "Replacing 7B models that hit a quality ceiling"
    ],
    "_en_extras": {
      "strengths": [
        "Apache 2.0 — fully commercial-friendly",
        "MMLU 79.7 and HumanEval 83.5 at 14B scale",
        "Excellent VRAM-to-quality ratio",
        "131k context via YaRN extension"
      ],
      "weaknesses": [
        "Native context is 32k — 131k requires YaRN configuration",
        "Outscored on hard reasoning by 30B+ alternatives",
        "Vision not included — pick Qwen2.5-VL if you need it"
      ],
      "verdict": "The default Apache 2.0 dense model for self-hosted general chat — solid quality at a price most teams can run."
    }
  },
  {
    "id": "qwen25-72b",
    "name": "Qwen 2.5 72B Instruct",
    "author": "Alibaba",
    "origin": "cn",
    "params": 72,
    "family": "Qwen",
    "license": "Qwen License",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 42,
      "q5": 50,
      "q8": 78,
      "fp16": 144
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 18
    },
    "desc": "Alibaba's flagship Qwen 2.5 dense at 72B, with MMLU 86.1 and HumanEval 86.6. Strong across the board but under the custom Qwen License with a 100M MAU threshold.",
    "best": [
      "Top-tier dense chat under 100M MAU",
      "Math-heavy workloads needing MATH 83.1",
      "Code generation where HumanEval 86.6 matters",
      "Multi-GPU deployments wanting near-frontier quality"
    ],
    "_en_extras": {
      "strengths": [
        "MMLU 86.1 — close to much larger models",
        "HumanEval 86.6 strong for a general-purpose model",
        "MATH 83.1",
        "131k context with solid long-context behavior"
      ],
      "weaknesses": [
        "Custom Qwen License with the 100M MAU clause",
        "~42GB at Q4 — dual-GPU territory",
        "Slower than MoE alternatives like Qwen 3 30B-A3B for similar quality"
      ],
      "verdict": "The strongest open dense 72B you can self-host — just check the license before scaling past 100M MAU."
    }
  },
  {
    "id": "qwen25-coder-15b",
    "name": "Qwen 2.5 Coder 1.5B Instruct",
    "author": "Alibaba",
    "origin": "cn",
    "params": 1.5,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "code",
      "small"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 1,
      "q5": 1.2,
      "q8": 2,
      "fp16": 3
    },
    "ramCpu": 3,
    "tokSec": {
      "low": 50,
      "mid": 150,
      "high": 300
    },
    "desc": "Alibaba's smallest Qwen 2.5 Coder at 1.5B parameters under Apache 2.0, covering 92 programming languages. HumanEval 70.7 makes it a serious on-device completion model.",
    "best": [
      "Local inline completion in IDE plugins",
      "Edge devices and laptops without dedicated GPUs",
      "Latency-critical code suggestions where 7B is too slow",
      "Fallback model when bigger coders are unavailable"
    ],
    "_en_extras": {
      "strengths": [
        "Around 1GB VRAM at Q4 — runs nearly anywhere",
        "Strong inline completion for a 1.5B model",
        "Apache 2.0 license",
        "92 programming languages covered"
      ],
      "weaknesses": [
        "1.5B caps code quality — not for complex generation",
        "32k context only",
        "Outclassed on harder tasks by 7B+ coders"
      ],
      "verdict": "An impressively capable 1.5B coder — keep it for on-device completion, not for whole-feature generation."
    }
  },
  {
    "id": "qwen25-coder-3b",
    "name": "Qwen 2.5 Coder 3B Instruct",
    "author": "Alibaba",
    "origin": "cn",
    "params": 3,
    "family": "Qwen",
    "license": "Qwen Research License",
    "tags": [
      "code",
      "small"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 2,
      "q5": 2.5,
      "q8": 4,
      "fp16": 6
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 160
    },
    "desc": "Alibaba's 3B Qwen 2.5 Coder hitting HumanEval 84.1, covering 92 programming languages. Restricted to non-commercial use under the Qwen Research License.",
    "best": [
      "Personal coding assistant on 4–8GB VRAM",
      "Academic research on small code models",
      "On-device completion where the 1.5B isn't strong enough",
      "Side projects under the Qwen Research License"
    ],
    "_en_extras": {
      "strengths": [
        "HumanEval 84.1 — exceptional for 3B",
        "Around 2GB VRAM at Q4",
        "Fast inline completion",
        "92 programming languages"
      ],
      "weaknesses": [
        "Qwen Research License blocks commercial use",
        "32k context only",
        "Trails the 7B Coder on complex multi-file tasks"
      ],
      "verdict": "Best-in-class 3B coder benchmark-wise, but the non-commercial license pushes most teams to the Apache 2.0 1.5B or 14B variants."
    }
  },
  {
    "id": "qwen25-coder-14b",
    "name": "Qwen 2.5 Coder 14B Instruct",
    "author": "Alibaba",
    "origin": "cn",
    "params": 14,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "code"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 9,
      "q5": 11,
      "q8": 16,
      "fp16": 28
    },
    "ramCpu": 16,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 55
    },
    "desc": "Alibaba's Qwen 2.5 Coder 14B under Apache 2.0 with HumanEval 89.6 and LiveCodeBench 37.1. The VRAM sweet spot for serious self-hosted code generation.",
    "best": [
      "Self-hosted coding agents on a single 24GB GPU",
      "Repo-scale code generation needing 131k context",
      "Permissively licensed alternative to Codestral",
      "Multi-language production codebases"
    ],
    "_en_extras": {
      "strengths": [
        "HumanEval 89.6 — competitive with much larger coders",
        "LiveCodeBench 37.1",
        "Apache 2.0 license",
        "131k context for long-file work"
      ],
      "weaknesses": [
        "Weaker than general 14B models on non-code chat",
        "No vision input",
        "Outscored by frontier closed APIs on the hardest benchmarks"
      ],
      "verdict": "The pragmatic Apache 2.0 coder — strong benchmarks, 24GB VRAM, and no licensing landmines."
    }
  },
  {
    "id": "qwen3-30b-a3b",
    "name": "Qwen 3 30B-A3B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 30,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 62
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Alibaba's Qwen 3 MoE with 30B total and just 3B active parameters, supporting hybrid thinking mode. MMLU 81.4, AIME24 80.4, 100+ languages, Apache 2.0.",
    "best": [
      "Fast self-hosted chat that toggles into reasoning when needed",
      "Multilingual production across 100+ languages",
      "Workloads needing reasoning quality without the verbosity of dedicated reasoners",
      "Single 24GB GPU deployments wanting MoE inference speed"
    ],
    "_en_extras": {
      "strengths": [
        "3B active parameters keeps inference fast and cheap",
        "MMLU 81.4 and AIME24 80.4 — strong on both general and reasoning",
        "Apache 2.0",
        "Hybrid thinking toggle per request",
        "100+ language coverage"
      ],
      "weaknesses": [
        "~19GB at Q4 — slightly tight on 16GB cards",
        "Thinking mode adds latency and token cost",
        "MoE routing complicates some fine-tuning workflows"
      ],
      "verdict": "The most pragmatic Apache 2.0 model on the market — MoE speed, reasoning on demand, and one of the strongest 24GB-class options."
    }
  },
  {
    "id": "deepseek-r1-distill-qwen-15b",
    "name": "DeepSeek R1 Distill Qwen 1.5B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 1.5,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "reasoning",
      "small"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 1,
      "q5": 1.2,
      "q8": 2,
      "fp16": 3
    },
    "ramCpu": 3,
    "tokSec": {
      "low": 50,
      "mid": 150,
      "high": 300
    },
    "desc": "DeepSeek's R1 reasoning distilled into a 1.5B MIT-licensed model with visible chain-of-thought. Hits MATH-500 83.9 and runs on any laptop.",
    "best": [
      "Teaching and demos showing CoT reasoning on minimal hardware",
      "Math tutoring apps on edge devices",
      "Research baselines for distillation experiments",
      "Battery-constrained mobile deployments"
    ],
    "_en_extras": {
      "strengths": [
        "Around 1GB VRAM at Q4 — runs on any laptop",
        "Visible chain-of-thought reasoning",
        "MIT license — fully unrestricted",
        "128k context in a 1.5B model"
      ],
      "weaknesses": [
        "Reasoning depth is genuinely limited at 1.5B despite CoT",
        "Highly verbose — token costs add up fast",
        "Outclassed by the 14B distill on anything non-trivial"
      ],
      "verdict": "A fun MIT-licensed reasoning model that fits anywhere, but the 1.5B ceiling shows on real problems."
    }
  },
  {
    "id": "deepseek-r1-distill-qwen-14b",
    "name": "DeepSeek R1 Distill Qwen 14B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 14,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "reasoning"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 9,
      "q5": 11,
      "q8": 16,
      "fp16": 28
    },
    "ramCpu": 16,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 55
    },
    "desc": "DeepSeek's R1 reasoning distilled into Qwen 14B under MIT. AIME24 69.7 and MATH-500 93.9 — beats o1-mini on most reasoning benchmarks.",
    "best": [
      "Math, coding, and STEM reasoning on a single 24GB GPU",
      "Local alternative to o1-mini-class APIs",
      "Workloads needing MIT-licensed reasoning",
      "Agentic planners that benefit from explicit CoT"
    ],
    "_en_extras": {
      "strengths": [
        "AIME24 69.7 and MATH-500 93.9",
        "Beats o1-mini on multiple reasoning benchmarks",
        "MIT license — no usage restrictions",
        "131k context"
      ],
      "weaknesses": [
        "Verbose CoT inflates token costs",
        "Slower than non-reasoning 14B for simple queries",
        "No vision or tool-use specialization"
      ],
      "verdict": "The best 14B reasoner on permissive license today — a serious local alternative to o1-mini for STEM workloads."
    }
  },
  {
    "id": "phi4-mini",
    "name": "Phi-4 Mini 3.8B",
    "author": "Microsoft",
    "origin": "us",
    "params": 3.8,
    "family": "Phi",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 10,
      "q5": 12,
      "q8": 18,
      "fp16": 33
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 14,
      "mid": 40,
      "high": 100
    },
    "desc": "Microsoft's 3.8B Phi-4 Mini under MIT with native function calling, 128k context via LongRoPE, and a 200k vocab. MMLU 67.3 and HumanEval 74.4.",
    "best": [
      "Tool-using agents on minimal hardware",
      "On-device assistants requiring function calls",
      "MIT-licensed embedded deployments",
      "Long-context document tasks in a small model"
    ],
    "_en_extras": {
      "strengths": [
        "Native function calling at 3.8B",
        "128k context via LongRoPE",
        "MIT license",
        "200k vocabulary improves tokenization efficiency"
      ],
      "weaknesses": [
        "English-first — multilingual coverage is thin",
        "Outscored on raw quality by Qwen 2.5 3B",
        "Tool-calling reliability still trails larger models"
      ],
      "verdict": "The MIT-licensed pick for small tool-using agents — strong function calling and 128k context in a 3.8B footprint."
    }
  },
  {
    "id": "phi4-mini-reasoning",
    "name": "Phi-4 Mini Reasoning 3.8B",
    "author": "Microsoft",
    "origin": "us",
    "params": 3.8,
    "family": "Phi",
    "license": "MIT",
    "tags": [
      "reasoning",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 10,
      "q5": 12,
      "q8": 18,
      "fp16": 33
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 14,
      "mid": 40,
      "high": 100
    },
    "desc": "Microsoft's 3.8B Phi-4 Mini variant trained on R1-style reasoning traces under MIT. AIME24 57.5 and MATH-500 94.6 — remarkable math chops for the size.",
    "best": [
      "Math and STEM reasoning on a laptop",
      "Educational tutoring apps under MIT",
      "Research into small-model reasoning distillation",
      "Battery-constrained reasoning workloads"
    ],
    "_en_extras": {
      "strengths": [
        "AIME24 57.5 — exceptional for 3.8B",
        "MATH-500 94.6 nearly matches frontier models",
        "Fits comfortably on any laptop",
        "MIT license"
      ],
      "weaknesses": [
        "English-first",
        "Verbose CoT typical of reasoning models",
        "Outside math, quality trails the base Phi-4 Mini"
      ],
      "verdict": "Pound-for-pound the most impressive small reasoner under MIT — pick it for math on the smallest hardware."
    }
  },
  {
    "id": "gemma3n-e2b",
    "name": "Gemma 3n E2B",
    "author": "Google",
    "origin": "us",
    "params": 2,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "small"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 2,
      "q5": 2.5,
      "q8": 3.5,
      "fp16": 6
    },
    "ramCpu": 6,
    "tokSec": {
      "low": 35,
      "mid": 100,
      "high": 200
    },
    "desc": "Google's Gemma 3n with 2B effective parameters (6B raw) using MatFormer, covering 140+ languages. Optimized for mobile and edge; text-only on Ollama.",
    "best": [
      "Mobile and embedded deployments where memory is scarce",
      "Multilingual edge inference across 140+ languages",
      "Battery-constrained on-device chat",
      "MatFormer-based research and experimentation"
    ],
    "_en_extras": {
      "strengths": [
        "Built specifically for mobile and edge hardware",
        "140+ language coverage in a tiny footprint",
        "MatFormer architecture maximizes memory efficiency",
        "Per-layer shared embeddings cut RAM use"
      ],
      "weaknesses": [
        "32k context only",
        "Absolute quality trails Gemma 3 9B",
        "Gemma license — not as permissive as Apache 2.0",
        "Multimodal features not exposed via Ollama"
      ],
      "verdict": "Google's most memory-efficient small model — purpose-built for mobile and edge inference, with multilingual to match."
    }
  },
  {
    "id": "gemma3n-e4b",
    "name": "Gemma 3n E4B",
    "author": "Google",
    "origin": "us",
    "params": 4,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "small"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 4.5,
      "q5": 5.5,
      "q8": 8,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 22,
      "mid": 65,
      "high": 150
    },
    "desc": "Google's full Gemma 3n with 4B effective parameters (8B raw) and nested MatFormer architecture. Native multimodal across 140 languages for high-end mobile deployments.",
    "best": [
      "High-end mobile or edge devices needing multimodal input",
      "Multilingual on-device assistants across 140 languages",
      "Image-aware mobile workflows",
      "Replacing E2B when accuracy matters more than RAM"
    ],
    "_en_extras": {
      "strengths": [
        "4B effective parameters punch well above mobile-class weights",
        "Integrated multimodal — text and image input",
        "140 language coverage",
        "Open Gemma license"
      ],
      "weaknesses": [
        "32k context only",
        "Beaten by Gemma 3 12B in desktop scenarios",
        "Gemma license — less permissive than Apache 2.0",
        "Multimodal support uneven across runtimes"
      ],
      "verdict": "The full-fat Gemma 3n — strong mobile multimodal with surprising quality, if Gemma's license fits your use case."
    }
  },
  {
    "id": "granite32-8b",
    "name": "Granite 3.2 8B Instruct",
    "author": "IBM",
    "origin": "us",
    "params": 8,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "IBM's enterprise-focused 8B Granite 3.2 with a toggleable thinking mode under Apache 2.0. MMLU 65.5 and IFEval 70.9, with built-in IBM safety guardrails.",
    "best": [
      "Enterprise RAG deployments needing strict instruction following",
      "Regulated environments requiring safety guardrails out of the box",
      "Internal tools where Apache 2.0 plus IBM backing matters",
      "Workloads benefiting from optional thinking mode"
    ],
    "_en_extras": {
      "strengths": [
        "128k context",
        "Apache 2.0 license",
        "Strong RAG and enterprise instruction following",
        "IBM Safety Guardrails included",
        "Toggleable thinking mode"
      ],
      "weaknesses": [
        "Trails Llama 3.1 8B on general chat",
        "Very enterprise-flavored tone",
        "Weaker than Qwen 2.5 7B on coding tasks"
      ],
      "verdict": "The default open 8B for enterprise RAG and regulated workloads — picked for safety guardrails and IBM support, not chat quality."
    }
  },
  {
    "id": "granite33-8b",
    "name": "Granite 3.3 8B Instruct",
    "author": "IBM",
    "origin": "us",
    "params": 8,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "code"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "IBM's update to Granite 3.2 8B adding fill-in-the-middle code support and improved instruction following. Apache 2.0 with strong agent and tool-use behavior.",
    "best": [
      "Enterprise agents needing tool use and structured output",
      "RAG pipelines where instruction-following reliability matters",
      "Internal developer tooling combining code and chat",
      "Drop-in upgrade from Granite 3.2 8B"
    ],
    "_en_extras": {
      "strengths": [
        "128k context",
        "Apache 2.0 license",
        "Strong agentic and tool-use behavior",
        "Fill-in-the-middle code completion added",
        "Better instruction following than 3.2"
      ],
      "weaknesses": [
        "Still very enterprise-flavored",
        "Less versatile than Qwen 3 8B on open-ended chat",
        "Code quality trails dedicated coders like Qwen 2.5 Coder 7B"
      ],
      "verdict": "A clean upgrade over Granite 3.2 8B for enterprise agents — better tool use, better code, same Apache 2.0 backbone."
    }
  },
  {
    "id": "granite4-small",
    "name": "Granite 4.0 H-Small 32B-A9B",
    "author": "IBM",
    "origin": "us",
    "params": 32,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 64
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 75
    },
    "desc": "IBM's hybrid Mamba-2 + MoE model with 32B total and 9B active parameters, engineered to slash long-context memory use by roughly 70% versus comparable transformers under Apache 2.0.",
    "best": [
      "Long-document RAG pipelines where VRAM is the bottleneck",
      "Enterprise deployments needing a permissive Apache 2.0 license",
      "Self-hosted assistants handling 100k+ token transcripts",
      "Cost-sensitive inference at sustained high concurrency",
      "Workloads where you want MoE throughput without the H100-class footprint"
    ],
    "_en_extras": {
      "strengths": [
        "Hybrid Mamba-2 architecture cuts long-context memory by ~70%",
        "MoE design keeps active params at 9B for fast inference",
        "Apache 2.0 with no usage restrictions",
        "Built with enterprise governance and provenance in mind",
        "Strong throughput on commodity multi-GPU setups"
      ],
      "weaknesses": [
        "Requires a recent llama.cpp build for hybrid architecture support",
        "Tooling ecosystem still catching up to dense Llama-class models",
        "Quality trails frontier 30B+ dense models on hard reasoning"
      ],
      "verdict": "The most memory-efficient open MoE for long-context enterprise work — pick it when VRAM, license, and 128k context all matter."
    }
  },
  {
    "id": "granite4-tiny",
    "name": "Granite 4.0 H-Tiny 7B-A1B",
    "author": "IBM",
    "origin": "us",
    "params": 7,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 4,
      "q5": 5,
      "q8": 7,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 60,
      "mid": 180,
      "high": 350
    },
    "desc": "IBM's edge-class hybrid MoE with 7B total and only 1B active parameters — Apache 2.0 licensed and built for embedded and low-cost serving.",
    "best": [
      "On-device assistants on laptops or edge boxes",
      "High-QPS endpoints where active-param cost dominates",
      "Long-context summarization on memory-constrained hardware",
      "Embedded products needing a clean commercial license",
      "Prototyping pipelines before scaling to Granite 4.0 Small"
    ],
    "_en_extras": {
      "strengths": [
        "Extremely low compute cost per token via 1B active params",
        "Apache 2.0 license with no commercial strings attached",
        "128k context handled efficiently thanks to hybrid Mamba-2",
        "Tiny memory footprint suits edge and serverless deploys"
      ],
      "weaknesses": [
        "Quality lags dense 3B models on some single-shot tasks",
        "Smaller active capacity hurts complex reasoning",
        "Needs current llama.cpp support to run efficiently"
      ],
      "verdict": "The most efficient Apache-licensed MoE for edge inference — the right pick when cost-per-token and license cleanliness trump raw quality."
    }
  },
  {
    "id": "tulu3-8b",
    "name": "Tülu 3 8B",
    "author": "Allen AI",
    "origin": "us",
    "params": 8,
    "family": "Tulu",
    "license": "Llama 3.1 Community",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "Allen AI's fully open post-training recipe applied to Llama 3.1 8B, hitting 87.6 on GSM8K with all data, code, and evals released publicly.",
    "best": [
      "Reproducible research on RLHF and DPO pipelines",
      "Drop-in replacement for Llama 3.1 8B Instruct with stronger math",
      "Instruction-following workloads needing high IFEval scores",
      "Teams that need to audit training data end-to-end",
      "Academic baselines requiring full provenance"
    ],
    "_en_extras": {
      "strengths": [
        "Best fully-open RLHF recipe shipped to date",
        "GSM8K 87.6 is class-leading at 8B",
        "IFEval 82.4 shows strong instruction adherence",
        "Training data, code, and evals all publicly available",
        "Stable behavior on standard chat benchmarks"
      ],
      "weaknesses": [
        "Inherits the Llama 3.1 Community License",
        "No native vision or tool-use specialization",
        "Eclipsed at the frontier by larger open models"
      ],
      "verdict": "The reference open RLHF recipe at 8B — choose it when reproducibility and post-training transparency matter as much as benchmark scores."
    }
  },
  {
    "id": "tulu3-70b",
    "name": "Tülu 3 70B",
    "author": "Allen AI",
    "origin": "us",
    "params": 70,
    "family": "Tulu",
    "license": "Llama 3.1 Community",
    "tags": [
      "chat",
      "general",
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 40,
      "q5": 48,
      "q8": 75,
      "fp16": 140
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "Allen AI's fully open RLHF stack on Llama 3.1 70B, beating Claude Haiku, GPT-3.5 Turbo, and GPT-4o-mini on standard reasoning and code benchmarks.",
    "best": [
      "Self-hosted alternative to closed mid-tier APIs",
      "Math-heavy chat with GSM8K 93.5 territory performance",
      "Code assistance where HumanEval+ matters more than agentic loops",
      "Research projects that need a fully documented post-training pipeline",
      "Workloads that justify a 2x A100 footprint"
    ],
    "_en_extras": {
      "strengths": [
        "Beats Claude Haiku, GPT-3.5 Turbo, and GPT-4o-mini on key evals",
        "GSM8K 93.5 and HumanEval+ 92.4 at open weights",
        "Fully open SFT + DPO + RLVR recipe",
        "Strong instruction following and refusal calibration",
        "Stable, well-documented behavior for production deploys"
      ],
      "weaknesses": [
        "~40 GB VRAM at Q4 — needs serious hardware",
        "Bound by Llama 3.1 Community License",
        "No multimodal capabilities"
      ],
      "verdict": "The strongest fully open post-trained 70B available — a credible self-hosted replacement for closed mid-tier chat APIs."
    }
  },
  {
    "id": "olmoe-1b-7b",
    "name": "OLMoE 1B-7B Instruct",
    "author": "Allen AI",
    "origin": "us",
    "params": 7,
    "family": "OLMo",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe",
      "small"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 4,
      "q5": 5,
      "q8": 7,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 50,
      "mid": 150,
      "high": 300
    },
    "desc": "Allen AI's OLMoE is the only MoE released with weights, training data, and code fully open — 7B total with 1.3B active, matching Llama2-13B-Chat quality.",
    "best": [
      "Research that requires fully reproducible MoE training",
      "Latency-critical chat where 1.3B active params win",
      "Teaching and curriculum use cases needing full provenance",
      "Cheap CPU or single-GPU inference setups",
      "Baselines for new MoE architectures"
    ],
    "_en_extras": {
      "strengths": [
        "Very fast inference with only 1.3B active parameters",
        "Training corpus is 100% open source (Dolmino + Pile 2)",
        "Apache 2.0 license throughout",
        "Competitive with Llama2-13B-Chat at a fraction of the cost"
      ],
      "weaknesses": [
        "4096-token context is limiting for modern workloads",
        "Quality trails recent dense 7B models",
        "Limited tooling and quantization support"
      ],
      "verdict": "The only truly open MoE end-to-end — pick it for research and education over raw production quality."
    }
  },
  {
    "id": "molmo-7b",
    "name": "Molmo 7B-D",
    "author": "Allen AI",
    "origin": "us",
    "params": 7,
    "family": "Molmo",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "Allen AI's Apache-licensed VLM built on Qwen2-7B and CLIP, scoring between GPT-4V and GPT-4o on benchmarks with unique pointing and grounding capabilities.",
    "best": [
      "UI automation needing pixel-accurate pointing",
      "Visual grounding research with permissive licensing",
      "Image annotation pipelines requiring open data provenance",
      "Robotics and accessibility tools that need spatial references",
      "Replacing GPT-4V in workflows that demand on-prem deployment"
    ],
    "_en_extras": {
      "strengths": [
        "Pointing capability is rare in open VLMs",
        "Apache 2.0 across weights and PixMo training data",
        "Performance lands between GPT-4V and GPT-4o on standard benchmarks",
        "Transparent human-annotated training set"
      ],
      "weaknesses": [
        "4096-token context cap limits multi-turn vision chats",
        "OCR quality trails Qwen2-VL 7B",
        "Smaller community ecosystem than mainstream VLMs"
      ],
      "verdict": "The open VLM to choose when you need pointing and grounding under a clean commercial license."
    }
  },
  {
    "id": "molmo-72b",
    "name": "Molmo 72B",
    "author": "Allen AI",
    "origin": "us",
    "params": 72,
    "family": "Molmo",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 42,
      "q5": 50,
      "q8": 78,
      "fp16": 144
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 18
    },
    "desc": "Allen AI's flagship Apache 2.0 VLM built on Qwen2-72B, ranked #2 in human evaluation behind only GPT-4o for visual understanding.",
    "best": [
      "On-prem replacement for GPT-4o vision in regulated environments",
      "High-stakes visual analysis where quality dominates cost",
      "Research benchmarks demanding open weights at frontier quality",
      "Document and diagram understanding at scale",
      "Multi-GPU deployments already provisioned for 70B-class models"
    ],
    "_en_extras": {
      "strengths": [
        "Top-tier vision quality among open-weight VLMs",
        "Apache 2.0 license with PixMo open training data",
        "Strong on complex visual reasoning and dense scenes",
        "Human evaluation second only to GPT-4o"
      ],
      "weaknesses": [
        "~42 GB VRAM at Q4 typically requires 2-3 GPUs",
        "4096-token context constrains long multimodal sessions",
        "No official GGUF release complicates llama.cpp use"
      ],
      "verdict": "The highest-quality fully open VLM — choose it when you have the GPUs and need GPT-4o-class vision on-prem."
    }
  },
  {
    "id": "smollm3-3b",
    "name": "SmolLM3 3B",
    "author": "HuggingFace",
    "origin": "fr",
    "params": 3,
    "family": "SmolLM",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 2,
      "q5": 2.5,
      "q8": 4,
      "fp16": 6
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 160
    },
    "desc": "HuggingFace's 3B model with dual think/no-think modes, 128k context, and full open data and recipe — punching at MMLU 59.7 and GSM8K 70.9.",
    "best": [
      "Edge devices and laptops needing real reasoning at 3B",
      "Long-context tasks where larger models aren't viable",
      "Multilingual chat across the six supported European languages",
      "Educational and research use needing fully open training",
      "Latency-sensitive applications wanting toggleable thinking"
    ],
    "_en_extras": {
      "strengths": [
        "Compact think mode delivers reasoning at 3B scale",
        "Native support for six languages",
        "Apache 2.0 with fully open training data and recipe",
        "128k context unusual at this size",
        "Strong MMLU and GSM8K for the parameter count"
      ],
      "weaknesses": [
        "No official Ollama distribution — needs manual setup",
        "Quality ceiling typical of 3B dense models on hard tasks",
        "Smaller community than competing 3B releases"
      ],
      "verdict": "The reasoning-capable 3B to beat — ideal for edge deployments that still need think-mode and 128k context."
    }
  },
  {
    "id": "minicpm-v-26",
    "name": "MiniCPM-V 2.6 8B",
    "author": "OpenBMB",
    "origin": "cn",
    "params": 8,
    "family": "MiniCPM",
    "license": "MiniCPM Model License",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 32000,
    "vram": {
      "q4": 5.5,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "OpenBMB's 8B vision-language model pairing SigLIP and Qwen2, scoring 65.2 on OpenCompass and topping OCRBench among sub-25B models — ahead of even GPT-4o.",
    "best": [
      "OCR and document extraction at high resolution",
      "Multi-image and video understanding on a single GPU",
      "VLM workloads needing 32k context",
      "Replacing GPT-4V for screenshot and form parsing",
      "Mobile and consumer-grade inference of multimodal apps"
    ],
    "_en_extras": {
      "strengths": [
        "Tops OCRBench among sub-25B models, ahead of even GPT-4o",
        "OpenCompass 65.2 matches much larger VLMs",
        "Handles 1.8MP inputs without aggressive downsampling",
        "Native multi-image and video reasoning",
        "Free aspect-ratio handling avoids letterboxing artifacts"
      ],
      "weaknesses": [
        "MiniCPM Model License requires registration for commercial use",
        "Smaller community than Qwen2-VL or Llama-class VLMs",
        "Tooling support varies across inference backends"
      ],
      "verdict": "The OCR champion among compact open VLMs — the right call when document fidelity beats pure chat quality."
    }
  },
  {
    "id": "minicpm-o-26",
    "name": "MiniCPM-o 2.6 8B",
    "author": "OpenBMB",
    "origin": "cn",
    "params": 8,
    "family": "MiniCPM",
    "license": "MiniCPM Model License",
    "tags": [
      "vision",
      "audio",
      "chat"
    ],
    "ctx": 32000,
    "vram": {
      "q4": 5.5,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "OpenBMB's omni-modal 8B model adding audio and full-duplex speech streaming on top of vision, scoring 70.2 on OpenCompass and beating GPT-4o on single-image tasks.",
    "best": [
      "Voice assistants needing on-prem omni-modal capability",
      "Real-time speech-to-speech demos and prototypes",
      "Multimodal chat combining vision, audio, and text in one model",
      "Replacing GPT-4o omni for privacy-sensitive deployments",
      "Streaming applications that benefit from full-duplex inference"
    ],
    "_en_extras": {
      "strengths": [
        "End-to-end full-duplex speech streaming",
        "OpenCompass 70.2 across vision-language tasks",
        "Beats GPT-4o on single-image evaluations",
        "Unified omni-modal architecture in 8B"
      ],
      "weaknesses": [
        "Ollama integration is image-only — audio needs native inference",
        "Speech and audio paths require the official runtime",
        "Same MiniCPM license registration requirements"
      ],
      "verdict": "The closest open answer to GPT-4o omni — pick it when you need streaming voice and vision in a single self-hosted 8B model."
    }
  },
  {
    "id": "falcon3-7b",
    "name": "Falcon 3 7B Instruct",
    "author": "TII",
    "origin": "ae",
    "params": 7,
    "family": "Falcon",
    "license": "TII Falcon-LLM License 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 32000,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "TII's 7B trained on 14T tokens, hitting MMLU 70.5 — on par with Qwen2.5-7B — with native support for English, French, Spanish, German, and Portuguese.",
    "best": [
      "Multilingual chat across the five supported European languages",
      "General-purpose 7B serving where Qwen licensing is a concern",
      "Workloads needing 32k context at small scale",
      "Sovereign deployments preferring a non-Chinese-origin model",
      "Knowledge-heavy QA at the 7B tier"
    ],
    "_en_extras": {
      "strengths": [
        "MMLU 70.5 matches Qwen2.5-7B",
        "Trained on 14T tokens for broad knowledge coverage",
        "Five-language native support out of the box",
        "Permissive commercial license under TII Falcon-LLM 2.0",
        "32k context covers most production needs"
      ],
      "weaknesses": [
        "TII license is permissive but not Apache 2.0",
        "Smaller community than Llama or Qwen ecosystems",
        "No official multimodal variants"
      ],
      "verdict": "A credible non-Chinese 7B with Qwen-class quality — pick it for European multilingual work that needs a permissive commercial license."
    }
  },
  {
    "id": "falcon3-10b",
    "name": "Falcon 3 10B Instruct",
    "author": "TII",
    "origin": "ae",
    "params": 10,
    "family": "Falcon",
    "license": "TII Falcon-LLM License 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 32000,
    "vram": {
      "q4": 6,
      "q5": 8,
      "q8": 12,
      "fp16": 20
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 9,
      "mid": 28,
      "high": 70
    },
    "desc": "TII's depth-upscaled 10B successor to Falcon 3 7B, hitting MMLU 73.1 and GSM8K 83.1 — state-of-the-art under 13B at release.",
    "best": [
      "General chat where 7B is too weak and 13B too costly",
      "Multilingual production deploys across five EU languages",
      "Math-leaning tasks needing GSM8K 83+ at small scale",
      "Replacing Llama 3 8B with stronger benchmark numbers",
      "Workloads benefiting from 32k context"
    ],
    "_en_extras": {
      "strengths": [
        "SOTA among sub-13B models at release",
        "MMLU 73.1 with strong knowledge breadth",
        "Efficient depth-upscaled design from the 7B base",
        "Five-language coverage with permissive licensing",
        "Strong GSM8K performance for the size class"
      ],
      "weaknesses": [
        "TII Falcon-LLM 2.0 license, not Apache 2.0",
        "Limited fine-tune ecosystem versus Llama derivatives",
        "No multimodal version available"
      ],
      "verdict": "The strongest sub-13B Falcon to date — a solid mid-size pick when you need multilingual quality without the Llama license."
    }
  },
  {
    "id": "falcon-mamba-7b",
    "name": "Falcon Mamba 7B",
    "author": "TII",
    "origin": "ae",
    "params": 7,
    "family": "Falcon",
    "license": "TII Falcon-LLM License 2.0",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "TII's first serious pure Mamba SSM at scale — 7B with constant memory per token, sidestepping transformer attention costs entirely.",
    "best": [
      "Streaming workloads needing constant memory per token",
      "Research on state-space models versus transformers",
      "Throughput-bound inference where attention is the bottleneck",
      "Long-running generation where context grows unboundedly",
      "Edge inference on memory-constrained devices"
    ],
    "_en_extras": {
      "strengths": [
        "O(1) memory per token at inference",
        "No practical context limit imposed by attention",
        "Permissive TII Falcon-LLM License 2.0 allows commercial use",
        "Demonstrates Mamba viability at production scale"
      ],
      "weaknesses": [
        "Weaker in-context learning than transformers of equal size",
        "No vision or multimodal support",
        "Trained context is only 8k despite architectural headroom"
      ],
      "verdict": "The benchmark pure-Mamba 7B — pick it to study SSMs or to serve streaming workloads where attention costs hurt most."
    }
  },
  {
    "id": "command-r-35b",
    "name": "Command R 35B v01",
    "author": "Cohere",
    "origin": "ca",
    "params": 35,
    "family": "Command",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 20,
      "q5": 25,
      "q8": 37,
      "fp16": 70
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 28
    },
    "desc": "Cohere's original Command R, a 35B optimized for RAG and tool use across 10 languages with 128k context — but locked under CC-BY-NC for non-commercial use only.",
    "best": [
      "Research projects exploring early open RAG-native models",
      "Internal evaluations and prototyping with no commercial intent",
      "Tool-use experiments needing 128k context",
      "Multilingual RAG benchmarking across 10 languages",
      "Comparisons against successor Command R+ 104B"
    ],
    "_en_extras": {
      "strengths": [
        "First open model designed natively for RAG and tool use",
        "128k context for long retrieval pipelines",
        "10 evaluated languages, 23 in pretraining",
        "Strong citation and grounding behavior"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 license blocks commercial deployment",
        "Superseded by Command R+ 104B for production quality",
        "No multimodal capabilities"
      ],
      "verdict": "Historically important but commercially off-limits — choose it only for research, and reach for Command R+ everywhere else."
    }
  },
  {
    "id": "aya-23-8b",
    "name": "Aya 23 8B",
    "author": "Cohere For AI",
    "origin": "ca",
    "params": 8,
    "family": "Aya",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "Cohere For AI's pre-Expanse 8B multilingual model covering 23 languages, now superseded by Aya Expanse 8B and restricted to non-commercial use.",
    "best": [
      "Reproducing pre-Expanse multilingual baselines",
      "Research comparisons of Cohere multilingual lineage",
      "Non-commercial multilingual prototypes",
      "Teaching examples for multilingual instruction tuning"
    ],
    "_en_extras": {
      "strengths": [
        "Solid pre-Expanse multilingual coverage",
        "23 languages including French, Arabic, and Chinese",
        "Compact 8B footprint for the breadth"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 license blocks commercial use",
        "8k context is restrictive by current standards",
        "Superseded by Aya Expanse 8B in every dimension"
      ],
      "verdict": "Skip for new work — Aya Expanse 8B replaces this model with better quality under the same license."
    }
  },
  {
    "id": "aya-23-35b",
    "name": "Aya 23 35B",
    "author": "Cohere For AI",
    "origin": "ca",
    "params": 35,
    "family": "Aya",
    "license": "CC-BY-NC 4.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 20,
      "q5": 25,
      "q8": 37,
      "fp16": 70
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 28
    },
    "desc": "Cohere For AI's 35B pre-Expanse multilingual model on the Command base, covering 23 languages with strong instruction following — but locked to non-commercial use.",
    "best": [
      "Research baselines for multilingual instruction following",
      "Non-commercial multilingual chat in low-resource languages",
      "Comparisons against Aya Expanse 32B successor",
      "Academic evaluation across 23 languages"
    ],
    "_en_extras": {
      "strengths": [
        "Strong native quality across 23 languages",
        "Good instruction following in non-English settings",
        "Backed by Cohere's Command base architecture",
        "Competitive multilingual coverage for its era"
      ],
      "weaknesses": [
        "CC-BY-NC 4.0 license blocks commercial deployment",
        "~20 GB VRAM at Q4 with only 8k context",
        "Reasoning capabilities lag 2025-class open models"
      ],
      "verdict": "A strong pre-Expanse multilingual 35B — useful for research, but Aya Expanse and modern peers have moved past it."
    }
  },
  {
    "id": "yi-15-34b",
    "name": "Yi 1.5 34B Chat",
    "author": "01.AI",
    "origin": "cn",
    "params": 34,
    "family": "Yi",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 20,
      "q5": 24,
      "q8": 36,
      "fp16": 68
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "01.AI's dense 34B chat model under Apache 2.0, trained on 3.6T tokens with strong English-Chinese bilingual quality.",
    "best": [
      "Chinese-English bilingual chat needing open weights",
      "Llama-compatible tooling pipelines at the 34B scale",
      "Research baselines from the 2024 dense-34B era",
      "Workloads where Apache 2.0 is mandatory at 34B",
      "Use cases where Qwen 2.5 32B isn't an option"
    ],
    "_en_extras": {
      "strengths": [
        "Excellent Chinese-language performance",
        "Compatible with Llama tooling and quantization",
        "Apache 2.0 license enables free commercial use",
        "Stable chat behavior and well-understood quirks"
      ],
      "weaknesses": [
        "4096-token context is severely limiting today",
        "Outclassed by Qwen 2.5 32B in 2025",
        "No multimodal or tool-use specialization"
      ],
      "verdict": "A competent Apache-licensed bilingual 34B from 2024 — only pick it over Qwen 2.5 32B when license terms force your hand."
    }
  },
  {
    "id": "yi-coder-9b",
    "name": "Yi Coder 9B Chat",
    "author": "01.AI",
    "origin": "cn",
    "params": 9,
    "family": "Yi",
    "license": "Apache 2.0",
    "tags": [
      "code"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 5.5,
      "q5": 7,
      "q8": 10,
      "fp16": 18
    },
    "ramCpu": 12,
    "tokSec": {
      "low": 9,
      "mid": 28,
      "high": 75
    },
    "desc": "01.AI's 9B code model covering 52 programming languages, hitting 23% on LiveCodeBench — best-in-class under 10B and beating DeepSeek Coder 33B.",
    "best": [
      "Code completion and review on a single consumer GPU",
      "Polyglot codebases spanning many languages",
      "Self-hosted Copilot alternatives at small scale",
      "Code workloads needing 128k context for repo-level reasoning",
      "Cost-sensitive deployments where Qwen Coder is overkill"
    ],
    "_en_extras": {
      "strengths": [
        "LiveCodeBench 23% leads the sub-10B field",
        "Outperforms DeepSeek Coder 33B at a fraction of the size",
        "Coverage across 52 programming languages",
        "Apache 2.0 license",
        "128k context enables repo-scale code understanding"
      ],
      "weaknesses": [
        "Less popular than Qwen Coder, so fewer fine-tunes exist",
        "No instruction-tuned variant beyond chat",
        "Quality gap versus Qwen 2.5 Coder 7B/14B in 2025"
      ],
      "verdict": "The strongest sub-10B code model in its release window — still a sharp pick when you need 128k context on modest hardware."
    }
  },
  {
    "id": "dbrx-instruct",
    "name": "DBRX Instruct",
    "author": "Databricks",
    "origin": "us",
    "params": 132,
    "family": "DBRX",
    "license": "Databricks Open Model License",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 76,
      "q5": 94,
      "q8": 140,
      "fp16": 264
    },
    "ramCpu": 112,
    "tokSec": {
      "low": 2,
      "mid": 8,
      "high": 22
    },
    "desc": "Databricks' 132B MoE with 36B active params, trained on 12T tokens — state-of-the-art at March 2024 release but largely surpassed by DeepSeek V3 and R1.",
    "best": [
      "Databricks-native pipelines that need an in-house model",
      "Code and math workloads where 36B active params shine",
      "Research comparisons against modern frontier MoEs",
      "Multi-GPU deployments already provisioned for 100B+ models",
      "Internal evals before migrating to DeepSeek V3 or R1"
    ],
    "_en_extras": {
      "strengths": [
        "State-of-the-art quality at March 2024 release",
        "Strong on code and math benchmarks",
        "Databricks Open Model License is broadly permissive",
        "12T tokens of high-quality training data"
      ],
      "weaknesses": [
        "~76 GB VRAM at Q4 demands multi-GPU serving",
        "Largely outclassed by DeepSeek V3 and R1 in 2025",
        "HuggingFace repo is gated, slowing access"
      ],
      "verdict": "Historically important but no longer competitive — only choose it inside Databricks pipelines where the integration justifies the cost."
    }
  },
  {
    "id": "jais-30b",
    "name": "Jais 30B Chat v3",
    "author": "MBZUAI / Core42",
    "origin": "ae",
    "params": 30,
    "family": "Jais",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 18,
      "q5": 22,
      "q8": 33,
      "fp16": 60
    },
    "ramCpu": 28,
    "tokSec": {
      "low": 3,
      "mid": 13,
      "high": 32
    },
    "desc": "MBZUAI and Core42's reference open Arabic LLM — a 30B trained natively (not a fine-tune) for Arabic with strong English bilingual support under Apache 2.0.",
    "best": [
      "Arabic-first chat and content generation",
      "Bilingual Arabic-English customer support workloads",
      "MENA-region sovereign deployments",
      "Research on non-Latin-script LLM training",
      "Apache-licensed Arabic alternatives to closed APIs"
    ],
    "_en_extras": {
      "strengths": [
        "Native Arabic-first architecture, not a Llama fine-tune",
        "Strong bilingual Arabic-English performance",
        "Apache 2.0 license enables commercial use",
        "Backed by MBZUAI, Core42, and Cerebras"
      ],
      "weaknesses": [
        "8k context limits long-document workflows",
        "No official Ollama distribution",
        "Weaker than Jais Adapted 70B for hardest Arabic tasks"
      ],
      "verdict": "The default open Arabic LLM at the 30B class — pick it for native Arabic quality without the Llama license."
    }
  },
  {
    "id": "jais-70b",
    "name": "Jais Adapted 70B Chat",
    "author": "MBZUAI / Core42",
    "origin": "ae",
    "params": 70,
    "family": "Jais",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 40,
      "q5": 48,
      "q8": 75,
      "fp16": 140
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "MBZUAI and Core42's Llama-2 70B extended with 32k Arabic tokens and GQA — the strongest open-weight Arabic LLM, reaching GPT-4-class quality in Arabic.",
    "best": [
      "Production Arabic workloads needing top open quality",
      "Arabic legal, medical, or technical content generation",
      "Bilingual Arabic-English assistants at enterprise scale",
      "MENA sovereign deployments with multi-GPU budgets",
      "Replacing GPT-4 for Arabic-heavy regulated use cases"
    ],
    "_en_extras": {
      "strengths": [
        "Strongest open-weight Arabic model available",
        "GPT-4-level performance in Arabic",
        "Apache 2.0 license permits commercial use",
        "GQA improves inference efficiency at 70B scale"
      ],
      "weaknesses": [
        "~40 GB VRAM at Q4",
        "4096-token context is restrictive for long documents",
        "Limited capability outside Arabic and English"
      ],
      "verdict": "The clear top pick for Arabic at 70B — choose it when GPT-4-grade Arabic must run on your own hardware."
    }
  },
  {
    "id": "sarvam-m-24b",
    "name": "Sarvam-M 24B",
    "author": "Sarvam AI",
    "origin": "in",
    "params": 24,
    "family": "Sarvam",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 14,
      "q5": 17,
      "q8": 26,
      "fp16": 48
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 4,
      "mid": 15,
      "high": 40
    },
    "desc": "Sarvam AI's 24B built on Mistral Small 3.1 with hybrid think/no-think modes, gaining +86% on romanized GSM-8K Indic and covering 11 Indian languages plus English.",
    "best": [
      "Indic-language chat and content across 11 Indian languages",
      "Math-heavy workloads in romanized Indic scripts",
      "Hybrid reasoning where toggleable thinking helps",
      "Sovereign Indian deployments needing open weights",
      "Replacing closed APIs for Indian-market products"
    ],
    "_en_extras": {
      "strengths": [
        "+86% gain on romanized Indic GSM-8K",
        "Hybrid think/no-think mode toggle",
        "11 Indian languages plus English",
        "Apache 2.0 with permissive commercial use",
        "Mistral Small 3.1 base brings solid general quality"
      ],
      "weaknesses": [
        "No official Ollama distribution yet",
        "Strong Indic focus limits broader multilingual use",
        "Smaller community ecosystem than Mistral mainline"
      ],
      "verdict": "The top open model for Indic markets — pick it when you need real Indian-language coverage with hybrid reasoning."
    }
  },
  {
    "id": "salamandra-7b",
    "name": "Salamandra 7B Instruct",
    "author": "BSC",
    "origin": "es",
    "params": 7.7,
    "family": "Salamandra",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "Barcelona Supercomputing Center's 7.8B trained on 7.8T tokens covering 35 European languages and 92 programming languages — built for EU sovereignty under Apache 2.0.",
    "best": [
      "EU-sovereign chat deployments",
      "Multilingual workloads spanning all 35 supported European languages",
      "Code assistance across an unusually broad language set",
      "Public-sector projects requiring open European provenance",
      "Catalan, Occitan, and other low-resource Romance language use cases"
    ],
    "_en_extras": {
      "strengths": [
        "Backed by EU sovereignty and BSC infrastructure",
        "35 European languages natively supported",
        "Apache 2.0 license",
        "Coverage of 92 programming languages",
        "7.8T tokens of training data"
      ],
      "weaknesses": [
        "8k context falls short for long-document use",
        "No official Ollama distribution",
        "Quality trails frontier 7B models on English benchmarks"
      ],
      "verdict": "The reference EU-sovereign 7B — choose it when European language breadth and provenance matter more than top-tier English benchmarks."
    }
  },
  {
    "id": "salamandra-40b",
    "name": "Salamandra 40B Instruct",
    "author": "BSC",
    "origin": "es",
    "params": 40,
    "family": "Salamandra",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 24,
      "q5": 29,
      "q8": 43,
      "fp16": 80
    },
    "ramCpu": 40,
    "tokSec": {
      "low": 2,
      "mid": 10,
      "high": 25
    },
    "desc": "BSC's 40B scaled-up Salamandra covering 35 European languages with native Catalan support — though the HuggingFace repo is gated and successor ALIA-40B is now available.",
    "best": [
      "EU-sovereign workloads needing 40B-class quality",
      "Romance-language content generation, especially Catalan",
      "Public-sector and regulated deployments in Europe",
      "Multilingual research baselines across 35 European languages",
      "Workflows already provisioned for ALIA-40B comparisons"
    ],
    "_en_extras": {
      "strengths": [
        "Sovereign European model purpose-built for Romance languages",
        "Unique native Catalan capability among open models",
        "Apache 2.0 license",
        "7.68T tokens with strong Iberian-language coverage"
      ],
      "weaknesses": [
        "~24 GB VRAM at Q4",
        "8192-token context limits modern long-context use",
        "Limited fine-tune ecosystem and gated repo access"
      ],
      "verdict": "The strongest open model for Catalan and Iberian Romance languages — but check ALIA-40B first if you can run either."
    }
  },
  {
    "id": "eurollm-22b",
    "name": "EuroLLM 22B Instruct 2512",
    "author": "Utter Project",
    "origin": "eu",
    "params": 22.6,
    "family": "EuroLLM",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 13,
      "q5": 16,
      "q8": 24,
      "fp16": 45
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 4,
      "mid": 16,
      "high": 40
    },
    "desc": "Utter Project's 22.6B EU-sovereign model released February 2026 covering 35 European languages with 32k context — the heavy-duty successor to EuroLLM 9B.",
    "best": [
      "EU-sovereign workloads needing more than EuroLLM 9B can deliver",
      "Multilingual production chat across 35 European languages",
      "Long-context European-language workflows up to 32k",
      "Public-sector deployments requiring open weights and EU provenance",
      "Migration target for teams running the 9B variant in production"
    ],
    "_en_extras": {
      "strengths": [
        "22B scale gives meaningful headroom over EuroLLM 9B",
        "Apache 2.0 license",
        "32k context handles document-length European-language workloads",
        "EU sovereignty across the full project stack",
        "Fresh February 2026 release with current training data"
      ],
      "weaknesses": [
        "No official Ollama distribution at launch",
        "Smaller community than mainline open models",
        "Tooling and quantization support still maturing"
      ],
      "verdict": "The new heavyweight EuroLLM — choose it when you've outgrown the 9B and need EU-sovereign multilingual quality at production scale."
    }
  },
  {
    "id": "claire-7b",
    "name": "Claire 7B 0.1",
    "author": "LINAGORA",
    "origin": "fr",
    "params": 7,
    "family": "Claire",
    "license": "CC-BY-NC-SA 4.0",
    "tags": [
      "chat",
      "fr"
    ],
    "ctx": 2048,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 14
    },
    "ramCpu": 8,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "LINAGORA's LoRA fine-tune of Falcon-7B specialized for spontaneous French dialogue. Released under CC-BY-NC-SA 4.0, with a separate Apache-licensed variant available for commercial work.",
    "best": [
      "Research projects focused on conversational French",
      "Non-commercial prototypes needing native-feeling French dialogue",
      "Academic studies of spoken-style language modeling",
      "Baseline comparisons against modern French-capable LLMs"
    ],
    "_en_extras": {
      "strengths": [
        "Natural, spoken-style French output",
        "Lightweight 7B footprint for local experimentation",
        "Backed by OpenLLM-France community work",
        "Targeted training on authentic French dialogue data"
      ],
      "weaknesses": [
        "CC-BY-NC-SA license blocks most commercial use",
        "Tiny 2k context window by modern standards",
        "Built on aging Falcon-7B base",
        "Outclassed by Mistral and Qwen on general French tasks"
      ],
      "verdict": "A historically interesting French-dialogue specialist, but the restrictive license and 2k context make it a research-only pick today."
    }
  },
  {
    "id": "jamba-15-mini",
    "name": "Jamba 1.5 Mini",
    "author": "AI21 Labs",
    "origin": "il",
    "params": 52,
    "family": "Jamba",
    "license": "Jamba Open Model License",
    "tags": [
      "chat",
      "general",
      "moe",
      "multilingual"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 30,
      "q5": 37,
      "q8": 55,
      "fp16": 104
    },
    "ramCpu": 48,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 50
    },
    "desc": "AI21 Labs' hybrid SSM-Transformer with MoE routing, activating 12B of 52B parameters. Delivers a verified 256k context window but ships under AI21's non-OSI Jamba license.",
    "best": [
      "Long-document workflows that genuinely use 200k+ tokens",
      "Multilingual chat across the 9 supported languages",
      "Benchmarking SSM-Transformer hybrids against pure attention models",
      "Use cases where the Jamba license terms are acceptable"
    ],
    "_en_extras": {
      "strengths": [
        "Effective 256k context (86% on RULER)",
        "Unique SSM-Transformer hybrid architecture",
        "Strong throughput vs. dense models of similar capability",
        "Solid 9-language coverage"
      ],
      "weaknesses": [
        "Custom Jamba license is not OSI-approved",
        "Partial llama.cpp support complicates local deployment",
        "Superseded by Jamba 1.6 and 1.7",
        "Smaller fine-tune ecosystem than Llama or Qwen"
      ],
      "verdict": "A novel hybrid with real long-context performance, now eclipsed by newer Jamba releases and gated by a non-standard license."
    }
  },
  {
    "id": "hunyuan-a13b",
    "name": "Hunyuan-A13B Instruct",
    "author": "Tencent",
    "origin": "cn",
    "params": 80,
    "family": "Hunyuan",
    "license": "Tencent Hunyuan License",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 48,
      "q5": 57,
      "q8": 85,
      "fp16": 160
    },
    "ramCpu": 72,
    "tokSec": {
      "low": 6,
      "mid": 20,
      "high": 50
    },
    "desc": "Tencent's fine-grained MoE activating 13B of 80B parameters, with dual fast/slow thinking modes and a 256k context. Released under Tencent's custom Hunyuan license.",
    "best": [
      "Reasoning-heavy tasks needing toggleable thinking modes",
      "Long-context analysis up to 256k tokens",
      "Cost-sensitive deployment of a frontier-class MoE",
      "Chinese-language production workloads"
    ],
    "_en_extras": {
      "strengths": [
        "Competitive with o1 and DeepSeek on mainstream benchmarks",
        "Native 256k context",
        "Dual fast/slow thinking for latency-quality tradeoffs",
        "Only 13B active parameters keeps inference cheap"
      ],
      "weaknesses": [
        "Tencent Hunyuan license has commercial restrictions",
        "No official Ollama distribution",
        "Tooling support trails Qwen and Llama"
      ],
      "verdict": "Frontier-tier MoE reasoning at a manageable active-parameter count, held back mainly by the custom Tencent license."
    }
  },
  {
    "id": "llava-onevision-7b",
    "name": "LLaVA-OneVision 7B",
    "author": "LMMs-Lab",
    "origin": "sg",
    "params": 7,
    "family": "LLaVA",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "An Apache-licensed 7B vision-language model from LMMs-Lab, combining SigLIP SO400M with Qwen2-7B. Handles single images, multi-image inputs, and video, and draws over 170k monthly downloads.",
    "best": [
      "Self-hosted VLM apps needing a permissive license",
      "Multi-image reasoning and short video understanding",
      "Fine-tuning base for domain-specific vision tasks",
      "Cost-sensitive image captioning and VQA pipelines"
    ],
    "_en_extras": {
      "strengths": [
        "Fully Apache 2.0 with no commercial gotchas",
        "Genuine multi-image and video support",
        "Mature ecosystem with strong community traction",
        "Solid Qwen2-7B language backbone"
      ],
      "weaknesses": [
        "No official Ollama packaging",
        "English-first; weaker on non-English vision QA",
        "Outpaced by Qwen3-VL on most 2025 benchmarks"
      ],
      "verdict": "A dependable, truly open VLM for self-hosters who value Apache licensing over the latest leaderboard score."
    }
  },
  {
    "id": "llava-onevision-72b",
    "name": "LLaVA-OneVision 72B",
    "author": "LMMs-Lab",
    "origin": "sg",
    "params": 72,
    "family": "LLaVA",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 42,
      "q5": 50,
      "q8": 78,
      "fp16": 144
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 18
    },
    "desc": "The 72B Apache-licensed flagship from LMMs-Lab, built on Qwen2-72B with strong English and Chinese vision performance. A 2024 state-of-the-art open VLM.",
    "best": [
      "High-fidelity image and video understanding on owned hardware",
      "Commercial deployments needing Apache-licensed VLMs at scale",
      "Multi-image reasoning over technical documents",
      "Bilingual EN/CN visual tasks"
    ],
    "_en_extras": {
      "strengths": [
        "State-of-the-art open vision quality at 2024 release",
        "Robust multi-image and video reasoning",
        "Apache 2.0 with no usage restrictions",
        "Solid bilingual EN/CN coverage"
      ],
      "weaknesses": [
        "Around 42 GB VRAM at Q4 — needs serious GPU resources",
        "32k context limits long-document workflows",
        "Surpassed by Qwen3-VL 30B in 2025 benchmarks"
      ],
      "verdict": "A heavyweight Apache VLM that still delivers, though Qwen3-VL has since taken the open-vision crown at lower cost."
    }
  },
  {
    "id": "arctic-instruct",
    "name": "Snowflake Arctic Instruct",
    "author": "Snowflake",
    "origin": "us",
    "params": 480,
    "family": "Arctic",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 4096,
    "vram": {
      "q4": 290,
      "q5": 345,
      "q8": 510,
      "fp16": 960
    },
    "ramCpu": 340,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "Snowflake's hybrid Dense-MoE with 17B active parameters out of 480B total. Apache-licensed and tuned for enterprise analytics, but the 4k context shows its age.",
    "best": [
      "Enterprise SQL generation and analytical reasoning",
      "Workloads where 17B-active inference economics matter",
      "Research into Dense-MoE hybrid architectures",
      "Permissive-license deployments in data-warehouse stacks"
    ],
    "_en_extras": {
      "strengths": [
        "Highly efficient inference for its 480B total size",
        "Strong on SQL and analytical tasks",
        "Apache 2.0 with no commercial restrictions",
        "Battle-tested in enterprise scenarios"
      ],
      "weaknesses": [
        "Around 290 GB VRAM at Q4 — GPU cluster territory",
        "4k context is severely limiting in 2026",
        "Outclassed by modern MoEs across most benchmarks"
      ],
      "verdict": "A historically important enterprise MoE, but the 4k context and infrastructure demands push it out of contention for new deployments."
    }
  },
  {
    "id": "grok-1",
    "name": "Grok-1 (base)",
    "author": "xAI",
    "origin": "us",
    "params": 314,
    "family": "Grok",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 188,
      "q5": 225,
      "q8": 335,
      "fp16": 630
    },
    "ramCpu": 240,
    "tokSec": {
      "low": 0.5,
      "mid": 2,
      "high": 8
    },
    "desc": "xAI's first open-weight release: a 314B MoE with about 86B active parameters under Apache 2.0. Base model only — no official instruction tuning shipped.",
    "best": [
      "Research into large-scale MoE architectures",
      "Custom fine-tuning projects with significant GPU budget",
      "Historical reference for xAI's open-weight lineage",
      "Apache-licensed base for downstream instruct training"
    ],
    "_en_extras": {
      "strengths": [
        "First open-weight model from xAI",
        "Apache 2.0 with full commercial freedom",
        "Efficient MoE design with top-2 routing across 8 experts",
        "Useful base for community fine-tunes"
      ],
      "weaknesses": [
        "Around 188 GB VRAM at Q4",
        "Raw base weights — no official instruct variant",
        "Comprehensively outpaced by Grok 2 and beyond",
        "Limited community fine-tunes vs. Llama or Qwen"
      ],
      "verdict": "A landmark open release that's now mostly a research artifact — pick a modern MoE for any real workload."
    }
  },
  {
    "id": "gpt-oss-120b",
    "name": "gpt-oss 120B",
    "author": "OpenAI",
    "origin": "us",
    "params": 117,
    "family": "gpt-oss",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 70,
      "q5": 85,
      "q8": 125,
      "fp16": 234
    },
    "ramCpu": 100,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "OpenAI's return to open weights: a 117B MoE with 5.1B active parameters, matching o4-mini quality. Fits a single 80 GB GPU and ships under Apache 2.0.",
    "best": [
      "Production deployments wanting OpenAI quality on owned hardware",
      "Reasoning and coding workloads at frontier quality",
      "128k-context document analysis on a single 80 GB GPU",
      "Apache-licensed alternative to API-only o4-mini"
    ],
    "_en_extras": {
      "strengths": [
        "Matches o4-mini on reasoning and coding benchmarks",
        "Apache 2.0 license with full commercial use",
        "128k context out of the box",
        "Fits on a single 80 GB accelerator"
      ],
      "weaknesses": [
        "Around 70 GB VRAM at Q4 — multi-GPU for higher precision",
        "MoE deployment is operationally more complex than dense"
      ],
      "verdict": "The most consequential open-weight release in years — frontier OpenAI quality on a single GPU under Apache 2.0."
    }
  },
  {
    "id": "gpt-oss-20b",
    "name": "gpt-oss 20B",
    "author": "OpenAI",
    "origin": "us",
    "params": 21,
    "family": "gpt-oss",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 13,
      "q5": 16,
      "q8": 23,
      "fp16": 42
    },
    "ramCpu": 18,
    "tokSec": {
      "low": 20,
      "mid": 55,
      "high": 130
    },
    "desc": "OpenAI's compact open-weight MoE with 3.6B active out of 21B total parameters. Matches o3-mini on a laptop-class GPU under Apache 2.0.",
    "best": [
      "Local development on consumer or workstation GPUs",
      "Edge deployments needing frontier-vendor quality",
      "128k-context tasks without datacenter hardware",
      "Apache-licensed replacement for o3-mini API calls"
    ],
    "_en_extras": {
      "strengths": [
        "Apache 2.0 with full commercial freedom",
        "Around 13 GB VRAM at Q4 — runs on a 16 GB card",
        "OpenAI quality in an accessible footprint",
        "Native 128k context"
      ],
      "weaknesses": [
        "MoE format uses more VRAM than equivalent dense models",
        "Fewer community fine-tunes than Llama or Qwen"
      ],
      "verdict": "The clear default for local OpenAI-quality inference — accessible VRAM, 128k context, and a real license."
    }
  },
  {
    "id": "kimi-k26",
    "name": "Kimi K2.6",
    "author": "Moonshot AI",
    "origin": "cn",
    "params": 1000,
    "family": "Kimi",
    "license": "Modified MIT",
    "tags": [
      "chat",
      "general",
      "vision",
      "moe"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 600,
      "q5": 720,
      "q8": 1080,
      "fp16": 2000
    },
    "ramCpu": 700,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 12
    },
    "desc": "Moonshot AI's April 2026 flagship: roughly 1T total parameters with 32B active, native multimodal, plus an agent-swarm mode coordinating up to 300 sub-agents.",
    "best": [
      "Frontier-tier agentic workloads with parallel sub-agents",
      "Long-context analysis up to 256k tokens",
      "Multimodal pipelines needing top-end open quality",
      "API-driven applications where local hosting isn't required"
    ],
    "_en_extras": {
      "strengths": [
        "1T total parameters with frontier-class performance",
        "Native 256k context window",
        "Unique 300-agent swarm coordination mode",
        "Multimodal across text and vision"
      ],
      "weaknesses": [
        "Around 600 GB VRAM at Q4 — datacenter only",
        "API-first; local hosting is impractical for most teams",
        "Modified MIT terms need legal review"
      ],
      "verdict": "A genuine frontier open-weight model, but you'll be consuming it via API unless you run a datacenter."
    }
  },
  {
    "id": "qwen3-vl-235b",
    "name": "Qwen 3 VL 235B-A22B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 235,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat",
      "general",
      "moe",
      "multilingual"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 142,
      "q5": 170,
      "q8": 250,
      "fp16": 470
    },
    "ramCpu": 160,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 28
    },
    "desc": "Alibaba's flagship Qwen 3 vision model: 235B MoE with 22B active parameters and a native 256k context that extends to 1M. The current open-weight vision leader.",
    "best": [
      "Best-in-class open vision performance",
      "Long-context multimodal analysis (256k native, 1M extended)",
      "Document, chart, and video understanding at scale",
      "Apache-licensed alternative to closed multimodal APIs"
    ],
    "_en_extras": {
      "strengths": [
        "Top open-weight vision model as of May 2026",
        "256k native context, extensible to 1M tokens",
        "Apache 2.0 license",
        "Only 22B active parameters keeps inference tractable"
      ],
      "weaknesses": [
        "Around 142 GB VRAM at Q4 — multi-GPU required",
        "Heavier operational lift than dense alternatives",
        "Overkill for simple captioning workloads"
      ],
      "verdict": "The open-vision benchmark to beat — if you can afford the GPUs, this is the model to deploy."
    }
  },
  {
    "id": "qwen3-vl-30b",
    "name": "Qwen 3 VL 30B-A3B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 30,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat",
      "general",
      "moe",
      "multilingual"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 62
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Qwen 3 VL's sweet spot: a 30B MoE with 3B active parameters and 256k context. Delivers most of the 235B's quality at a fraction of the hardware cost.",
    "best": [
      "Single-GPU multimodal deployments",
      "Long-context document and chart analysis",
      "Cost-conscious teams wanting near-flagship vision",
      "Apache-licensed VLM for commercial products"
    ],
    "_en_extras": {
      "strengths": [
        "Around 19 GB VRAM at Q4 — fits a single 24 GB card",
        "Native 256k multimodal context",
        "Efficient MoE with only 3B active parameters",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Lags the 235B on complex scene understanding",
        "Fewer fine-tunes than the older Qwen2-VL family"
      ],
      "verdict": "The pragmatic open-vision choice in 2026 — most of the flagship's quality on hardware most teams already own."
    }
  },
  {
    "id": "qwen3-vl-8b",
    "name": "Qwen 3 VL 8B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 8,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat",
      "general",
      "multilingual"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "The dense 8B entry in Qwen 3 VL, offering strong OCR and document analysis with a remarkable 256k multimodal context for its size.",
    "best": [
      "On-device or edge multimodal inference",
      "OCR and structured document extraction",
      "Long-context multimodal tasks on modest GPUs",
      "Quick prototyping of VLM-powered features"
    ],
    "_en_extras": {
      "strengths": [
        "Around 6 GB VRAM at Q4 — runs almost anywhere",
        "256k multimodal context in an 8B model",
        "Solid OCR and document analysis",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Trails the 30B variant on complex scene reasoning",
        "Limited capacity for advanced visual reasoning"
      ],
      "verdict": "The go-to small open VLM — Apache-licensed, long-context, and capable enough for most production document workflows."
    }
  },
  {
    "id": "ernie-45-300b",
    "name": "ERNIE 4.5 300B-A47B",
    "author": "Baidu",
    "origin": "cn",
    "params": 300,
    "family": "ERNIE",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 180,
      "q5": 215,
      "q8": 320,
      "fp16": 600
    },
    "ramCpu": 220,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "Baidu's first open release at frontier scale: a 300B MoE with 47B active parameters. Strongest open model for Chinese, with partial weight publication.",
    "best": [
      "Chinese-language production workloads at frontier quality",
      "Multilingual applications targeting East Asian markets",
      "Research benchmarking against Western open models",
      "Long-context analysis up to 128k tokens"
    ],
    "_en_extras": {
      "strengths": [
        "Best-in-class Chinese-language performance",
        "Efficient MoE inference with 47B active",
        "300B total parameters at frontier scale",
        "128k context window"
      ],
      "weaknesses": [
        "Around 180 GB VRAM at Q4",
        "Apache 2.0 licensed, but confirm Baidu's model-card terms before commercial rollout",
        "Limited adoption and support outside China",
        "Only partial weights publicly released"
      ],
      "verdict": "The strongest open model for Chinese workloads, but licensing and limited ecosystem outside China constrain its reach."
    }
  },
  {
    "id": "ernie-45-21b",
    "name": "ERNIE 4.5 21B-A3B Thinking",
    "author": "Baidu",
    "origin": "cn",
    "params": 21,
    "family": "ERNIE",
    "license": "Apache 2.0",
    "tags": [
      "reasoning",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 13,
      "q5": 16,
      "q8": 23,
      "fp16": 42
    },
    "ramCpu": 18,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Baidu's compact reasoning MoE with 3B active parameters out of 21B total. Fast inference thanks to the small active set, with Chinese-language strength.",
    "best": [
      "Cost-sensitive reasoning workloads",
      "Chinese-language reasoning tasks",
      "Single-GPU deployments needing 128k context",
      "Latency-sensitive applications"
    ],
    "_en_extras": {
      "strengths": [
        "Around 13 GB VRAM at Q4",
        "Compact MoE optimized for reasoning",
        "Strong Chinese-language performance",
        "128k context window"
      ],
      "weaknesses": [
        "Weaker multilingual coverage than Qwen",
        "Apache 2.0 licensed, but confirm Baidu's model-card terms before shipping",
        "Smaller community than Qwen or Llama equivalents"
      ],
      "verdict": "An efficient reasoning MoE with real Chinese strength, but Qwen's compact models remain easier to adopt outside China."
    }
  },
  {
    "id": "ring-1t",
    "name": "Ring-1T",
    "author": "Ant Group",
    "origin": "cn",
    "params": 1000,
    "family": "Ring",
    "license": "MIT",
    "tags": [
      "reasoning",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 600,
      "q5": 720,
      "q8": 1080,
      "fp16": 2000
    },
    "ramCpu": 700,
    "tokSec": {
      "low": 0.5,
      "mid": 3,
      "high": 10
    },
    "desc": "Ant Group's MIT-licensed open reasoner: 1T total parameters with 50B active, using a novel ring-all-reduce MoE architecture. Top of the open-reasoning leaderboards.",
    "best": [
      "Datacenter-scale reasoning workloads",
      "Research into novel MoE architectures",
      "Frontier benchmarking against closed reasoners",
      "Long-context reasoning up to 131k tokens"
    ],
    "_en_extras": {
      "strengths": [
        "First trillion-parameter Chinese open-weight model",
        "MIT license with full commercial freedom",
        "Original ring-MoE all-reduce architecture",
        "131k context window"
      ],
      "weaknesses": [
        "Around 600 GB VRAM at Q4 — datacenter only",
        "MIT-licensed, but downstream commercial use still merits a compliance review",
        "Operationally heavy to deploy and tune"
      ],
      "verdict": "A frontier open reasoner with a permissive license — practical only for teams running real datacenter infrastructure."
    }
  },
  {
    "id": "seed-oss-36b",
    "name": "Seed-OSS 36B Instruct",
    "author": "ByteDance",
    "origin": "cn",
    "params": 36,
    "family": "Seed",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general"
    ],
    "ctx": 524288,
    "vram": {
      "q4": 22,
      "q5": 26,
      "q8": 40,
      "fp16": 72
    },
    "ramCpu": 36,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 28
    },
    "desc": "ByteDance's first major open release: a dense 36B model with a native 524k context — roughly 4× the competition. Apache 2.0.",
    "best": [
      "Extreme long-document analysis (codebases, books, transcripts)",
      "RAG-free workflows that load everything into context",
      "Dense-model deployments preferring predictable behavior",
      "Apache-licensed commercial use"
    ],
    "_en_extras": {
      "strengths": [
        "524k native context — a record for accessible dense models",
        "Dense 36B is easier to deploy than equivalent MoEs",
        "Strong long-document comprehension",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Around 22 GB VRAM at Q4 (much more with full context)",
        "Apache 2.0 licensed, but ByteDance provenance may still prompt a compliance review",
        "Limited fine-tune ecosystem at launch"
      ],
      "verdict": "Unmatched long-context for a dense open model — the pick when you genuinely need to load 500k+ tokens at once."
    }
  },
  {
    "id": "exaone-45-33b",
    "name": "EXAONE 4.5 33B",
    "author": "LG AI Research",
    "origin": "kr",
    "params": 33,
    "family": "EXAONE",
    "license": "EXAONE AI Model License",
    "tags": [
      "chat",
      "general",
      "vision",
      "multilingual"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 20,
      "q5": 24,
      "q8": 36,
      "fp16": 66
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "LG AI Research's multimodal Korean flagship: a 33B model with 256k context that lands in the top 10 of the Artificial Analysis Intelligence Index.",
    "best": [
      "Korean-language production workloads",
      "Bilingual EN/KR multimodal applications",
      "Vision tasks needing a compact 33B footprint",
      "Long-context multimodal analysis"
    ],
    "_en_extras": {
      "strengths": [
        "256k context in a 33B model",
        "Integrated vision capabilities at this scale",
        "Strong Korean and English performance",
        "Top-10 placement on independent intelligence benchmarks"
      ],
      "weaknesses": [
        "Around 20 GB VRAM at Q4",
        "EXAONE license requires review for commercial use",
        "Smaller English-focused community than Llama or Qwen"
      ],
      "verdict": "The clear pick for Korean multimodal work — capable, compact, and competitive globally, with licensing caveats to verify."
    }
  },
  {
    "id": "nemotron-nano-3-30b",
    "name": "Nemotron Nano 3 30B-A3B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 30,
    "family": "Nemotron",
    "license": "NVIDIA Open Model License",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe"
    ],
    "ctx": 1000000,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 62
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "NVIDIA's Mamba-2 + Transformer hybrid MoE with 3B active out of 30B total parameters. A native 1M-token context with roughly 4× the throughput of Nemotron 2.",
    "best": [
      "Million-token context workloads",
      "Edge and on-device inference at unusually long context",
      "Throughput-critical pipelines (RAG ingestion, log analysis)",
      "Hybrid SSM-Transformer research and benchmarking"
    ],
    "_en_extras": {
      "strengths": [
        "Native 1M-token context window",
        "Ultra-efficient MoE with only 3B active parameters",
        "Roughly 4× throughput improvement over Nemotron 2",
        "Permissive NVIDIA Open Model license"
      ],
      "weaknesses": [
        "Full 1M context consumes substantial VRAM in practice",
        "Hybrid architecture has thinner tooling support",
        "Distilled from Llama — inherits some base-model quirks"
      ],
      "verdict": "The throughput-and-context champion for edge MoE deployments — built for workloads where 128k context isn't enough."
    }
  },
  {
    "id": "nemotron-nano-v2-vl-12b",
    "name": "Nemotron Nano v2 VL 12B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 12.6,
    "family": "Nemotron",
    "license": "NVIDIA Open Model License",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 8,
      "q5": 10,
      "q8": 14,
      "fp16": 25
    },
    "ramCpu": 14,
    "tokSec": {
      "low": 7,
      "mid": 22,
      "high": 60
    },
    "desc": "NVIDIA's 12.6B enterprise VLM with strong DocVQA and ChartQA scores, tuned for professional document extraction workflows.",
    "best": [
      "Enterprise document extraction and DocVQA pipelines",
      "Chart and table understanding at production scale",
      "Single-GPU multimodal deployments",
      "Long-context multimodal tasks up to 128k tokens"
    ],
    "_en_extras": {
      "strengths": [
        "Combined vision and text in a 12B footprint",
        "128k context window",
        "Strong DocVQA and ChartQA benchmark scores",
        "NVIDIA Open Model license"
      ],
      "weaknesses": [
        "Trails Qwen3-VL 30B on complex visual reasoning",
        "NVIDIA license terms differ from Apache or MIT",
        "Smaller community than Qwen or LLaVA families"
      ],
      "verdict": "A focused enterprise VLM that punches above its weight on documents and charts — the right call when extraction is the job."
    }
  },
  {
    "id": "apertus-70b",
    "name": "Apertus 70B",
    "author": "Swiss AI",
    "origin": "ch",
    "params": 70,
    "family": "Apertus",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 65536,
    "vram": {
      "q4": 40,
      "q5": 48,
      "q8": 75,
      "fp16": 140
    },
    "ramCpu": 64,
    "tokSec": {
      "low": 1,
      "mid": 6,
      "high": 20
    },
    "desc": "A Swiss AI joint effort (EPFL, ETH, CSCS) trained on 15T tokens covering 1000+ languages, including Swiss German and Romansh. Apache 2.0.",
    "best": [
      "European data-sovereignty-critical deployments",
      "Applications serving French, German, Italian, or Romansh users",
      "Research on broadly multilingual training",
      "Apache-licensed alternatives to US or Chinese flagships"
    ],
    "_en_extras": {
      "strengths": [
        "European data sovereignty story",
        "Only flagship model with native Romansh support",
        "Apache 2.0 license",
        "Strong across Alpine and broader European languages"
      ],
      "weaknesses": [
        "Around 40 GB VRAM at Q4 — needs a 48 GB-class GPU or a multi-GPU setup",
        "Smaller fine-tune ecosystem than Llama or Qwen",
        "English performance trails best-in-class US models"
      ],
      "verdict": "Europe's most credible sovereign open flagship — pick it when language coverage or data jurisdiction matters more than raw English benchmarks."
    }
  },
  {
    "id": "apertus-8b",
    "name": "Apertus 8B",
    "author": "Swiss AI",
    "origin": "ch",
    "params": 8,
    "family": "Apertus",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "fr"
    ],
    "ctx": 65536,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "The compact Swiss AI release trained on the Alps supercomputer, covering 1000+ languages including Swiss German and Romansh. Apache 2.0.",
    "best": [
      "Local multilingual EU deployments",
      "On-device assistants for French, German, Italian, or Romansh",
      "Data-sovereignty-sensitive prototypes",
      "Apache-licensed baseline for European fine-tuning"
    ],
    "_en_extras": {
      "strengths": [
        "Around 6 GB VRAM at Q4 — runs on consumer hardware",
        "Native EU multilingual coverage",
        "Apache 2.0 license",
        "Practical for everyday assistant use"
      ],
      "weaknesses": [
        "Trails Qwen 3 8B on English and coding tasks",
        "Limited public fine-tunes",
        "Less benchmark coverage than mainstream 8B models"
      ],
      "verdict": "The accessible sovereign 8B for European multilingual work — choose it when language reach beats benchmark dominance."
    }
  },
  {
    "id": "trinity-mini-26b",
    "name": "Trinity Mini 26B-A3B",
    "author": "Arcee AI",
    "origin": "us",
    "params": 26,
    "family": "Trinity",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 15,
      "q5": 18,
      "q8": 28,
      "fp16": 52
    },
    "ramCpu": 24,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Arcee AI's US-built MoE with 3B active parameters out of 26B total. Apache-licensed, fast in practice, and tuned for agent-style workloads.",
    "best": [
      "Agent frameworks needing fast, capable open models",
      "Enterprise deployments preferring US-based vendors",
      "Single-GPU inference with 128k context",
      "Apache-licensed MoE for commercial products"
    ],
    "_en_extras": {
      "strengths": [
        "Efficient MoE with only 3B active parameters",
        "128k context window",
        "Tuned for agent and tool-use workflows",
        "Apache 2.0"
      ],
      "weaknesses": [
        "Limited public benchmark coverage",
        "Less name recognition than Mistral or Qwen",
        "Smaller fine-tune ecosystem"
      ],
      "verdict": "A solid US-built MoE for agent work — worth a serious look if you value Apache licensing and a domestic vendor."
    }
  },
  {
    "id": "hunyuan-20-large",
    "name": "Hunyuan Large 2.0",
    "author": "Tencent",
    "origin": "cn",
    "params": 406,
    "family": "Hunyuan",
    "license": "Tencent Hunyuan License",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 245,
      "q5": 290,
      "q8": 435,
      "fp16": 810
    },
    "ramCpu": 300,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 12
    },
    "desc": "Tencent's 406B flagship MoE with 32B active parameters and 256k context. Strong on Chinese and English, but gated by the custom Tencent Hunyuan license.",
    "best": [
      "Frontier Chinese-language production workloads",
      "Long-context RAG and document analysis up to 256k tokens",
      "Bilingual EN/CN enterprise deployments",
      "Use cases compatible with the Tencent Hunyuan license"
    ],
    "_en_extras": {
      "strengths": [
        "256k native context window",
        "Top-tier Chinese-language performance",
        "Efficient inference relative to 406B total size",
        "Strong on RAG and long-document tasks"
      ],
      "weaknesses": [
        "Around 245 GB VRAM at Q4 — heavy infrastructure needed",
        "Custom Tencent license requires careful legal review",
        "Limited adoption outside Chinese-speaking markets"
      ],
      "verdict": "Frontier-class bilingual long-context performance, but licensing and infrastructure demands narrow its practical audience."
    }
  },
  {
    "id": "internvl-35-8b",
    "name": "InternVL 3.5 8B",
    "author": "OpenGVLab",
    "origin": "cn",
    "params": 8,
    "family": "InternVL",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "chat"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 6,
      "q5": 7,
      "q8": 10,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 10,
      "mid": 30,
      "high": 80
    },
    "desc": "OpenGVLab's 8B vision-language model leading MMMU among open models. Built at Shanghai AI Lab and released under Apache 2.0.",
    "best": [
      "Best-in-class 8B vision for OCR and chart understanding",
      "Single-GPU multimodal deployments",
      "Document and PDF analysis pipelines",
      "Apache-licensed VLM for commercial products"
    ],
    "_en_extras": {
      "strengths": [
        "Top quality-per-parameter ratio in 8B vision",
        "Strong OCR and chart understanding",
        "Apache 2.0 license",
        "Solid VQA and short-video performance"
      ],
      "weaknesses": [
        "32k context limits long-document multimodal work",
        "Weaker multilingual coverage than Qwen2-VL",
        "No native long-context extension"
      ],
      "verdict": "The benchmark-leading small open VLM for OCR and charts — the right pick when you need accuracy more than context length."
    }
  },
  {
    "id": "mimo-v2-flash",
    "name": "MiMo V2 Flash",
    "author": "Xiaomi",
    "origin": "cn",
    "params": 309,
    "family": "MiMo",
    "license": "MIT",
    "tags": [
      "chat",
      "code",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 185,
      "q5": 222,
      "q8": 330,
      "fp16": 618
    },
    "ramCpu": 230,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "Xiaomi's 309B-parameter sparse MoE (52B active) released under MIT, topping SWE-Bench Verified at 73.4% at launch. Built for heavy-duty code and reasoning work.",
    "best": [
      "Self-hosted coding agents that need frontier SWE-Bench accuracy",
      "Refactoring and bug-fixing pipelines over large repos",
      "Long-context code review (up to 128k tokens)",
      "MIT-licensed deployments where commercial use is non-negotiable",
      "Teams with multi-GPU infrastructure willing to trade VRAM for quality"
    ],
    "_en_extras": {
      "strengths": [
        "State-of-the-art SWE-Bench Verified score (73.4%) at release",
        "MoE design activates only 52B of 309B params, lowering inference cost",
        "128k context window suits whole-repo reasoning",
        "Permissive MIT license for commercial deployment",
        "Architecture borrows from DeepSeek's proven MoE recipe"
      ],
      "weaknesses": [
        "Requires roughly 185 GB VRAM in Q4 — multi-GPU or H100-class hardware",
        "Xiaomi's open-weight licensing is newer and worth a legal review",
        "Newer architecture may lag in tooling support outside vLLM"
      ],
      "verdict": "If you need an MIT-licensed, top-of-the-leaderboard coding model and have the GPUs to run it, MiMo V2 Flash is the pick."
    }
  },
  {
    "id": "rakuten-ai-3",
    "name": "Rakuten AI 3.0",
    "author": "Rakuten",
    "origin": "jp",
    "params": 700,
    "family": "Rakuten",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "multilingual",
      "moe"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 420,
      "q5": 500,
      "q8": 745,
      "fp16": 1400
    },
    "ramCpu": 500,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 12
    },
    "desc": "Rakuten's flagship ~700B MoE model built under Japan's GENIAC program and released under Apache 2.0. Best-in-class Japanese performance with serious enterprise e-commerce DNA.",
    "best": [
      "Japanese-language production workloads (support, content, search)",
      "Bilingual JP/EN e-commerce and retail applications",
      "Enterprise deployments needing Apache 2.0 licensing",
      "Replacing closed JP models like proprietary Rakuten/LINE APIs",
      "Localization pipelines targeting the Japanese market"
    ],
    "_en_extras": {
      "strengths": [
        "Top-tier Japanese fluency, beating most open models on JP benchmarks",
        "700B total parameters give it broad knowledge depth",
        "Apache 2.0 — no commercial restrictions",
        "Backed by Rakuten's massive e-commerce and fintech corpus",
        "Built under Japan's GENIAC sovereign-AI initiative"
      ],
      "weaknesses": [
        "Roughly 420 GB VRAM in Q4 — datacenter-only",
        "32k context is tight versus modern 128k+ flagships",
        "Heavily skewed toward Japanese and commerce; weaker on global general tasks"
      ],
      "verdict": "The default open-weight choice for Japanese enterprise; overkill and too narrowly specialized for anyone else."
    }
  },
  {
    "id": "kanana-2-30b",
    "name": "Kanana 2 30B-A3B Thinking",
    "author": "Kakao",
    "origin": "kr",
    "params": 30,
    "family": "Kanana",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 18,
      "q5": 22,
      "q8": 33,
      "fp16": 60
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Kakao's agentic 30B MoE (3B active) with native hybrid thinking and Korean-first training. Apache 2.0 with MLA attention and 131k context.",
    "best": [
      "Korean-language products from chat to content generation",
      "Multilingual deployments covering KR/EN/JP/ZH/TH/VI",
      "Agentic workflows that benefit from a togglable thinking mode",
      "Long-document analysis up to 131k tokens",
      "Apache 2.0 commercial use on a single 24 GB GPU"
    ],
    "_en_extras": {
      "strengths": [
        "131k context window in a 30B MoE",
        "Hybrid thinking/non-thinking mode toggle",
        "Native Korean performance backed by Kakao's corpus",
        "MLA attention cuts KV-cache footprint",
        "Apache 2.0 with only 3B active params per token"
      ],
      "weaknesses": [
        "Around 18 GB VRAM in Q4 — fits a single GPU but tight on consumer cards",
        "Quality drops outside Korean and English"
      ],
      "verdict": "The strongest open Korean model right now, with thinking mode and a sane VRAM budget on the side."
    }
  },
  {
    "id": "deepseek-ocr",
    "name": "DeepSeek-OCR",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 3,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "vision",
      "chat",
      "small"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 2,
      "q5": 2.5,
      "q8": 4,
      "fp16": 6
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 160
    },
    "desc": "DeepSeek's 3B MIT-licensed OCR specialist built on DeepEncoder, notable for its 'optical compression' approach. Punches well above its weight on documents, LaTeX, and tables.",
    "best": [
      "High-volume document OCR pipelines",
      "Extracting LaTeX formulas from scientific papers",
      "Parsing tables from PDFs, scans, and receipts",
      "Edge deployments needing OCR in ~2 GB VRAM",
      "MIT-licensed alternative to closed OCR APIs"
    ],
    "_en_extras": {
      "strengths": [
        "Best-in-class OCR quality at only 3B parameters",
        "Handles LaTeX formulas and table structure cleanly",
        "Runs in ~2 GB VRAM at Q4 — fits anywhere",
        "MIT license, no commercial restrictions",
        "Optical-compression approach reduces token usage on long documents"
      ],
      "weaknesses": [
        "8k context limits multi-page document handling",
        "OCR-only — not a general-purpose VLM",
        "Limited reasoning capability beyond extraction"
      ],
      "verdict": "Drop-in MIT OCR engine that beats far larger general VLMs at extraction tasks."
    }
  },
  {
    "id": "hunyuan-ocr-1b",
    "name": "HunyuanOCR 1B",
    "author": "Tencent",
    "origin": "cn",
    "params": 1,
    "family": "Hunyuan",
    "license": "Tencent Hunyuan License",
    "tags": [
      "vision",
      "chat",
      "small"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 0.8,
      "q5": 1,
      "q8": 1.5,
      "fp16": 2
    },
    "ramCpu": 3,
    "tokSec": {
      "low": 50,
      "mid": 150,
      "high": 300
    },
    "desc": "Tencent's 1B end-to-end OCR model that outperforms 235B general VLMs on document tasks. Engineered for edge and mobile deployment.",
    "best": [
      "On-device or mobile OCR with strict memory budgets",
      "High-throughput batch OCR where latency matters",
      "Receipt, invoice, and form processing at scale",
      "Embedded systems and edge gateways",
      "Cost-sensitive OCR pipelines replacing cloud APIs"
    ],
    "_en_extras": {
      "strengths": [
        "Runs in under 1 GB VRAM at Q4",
        "Beats 200B+ general VLMs on document benchmarks",
        "End-to-end model — no separate detection/recognition stages",
        "Latency low enough for real-time mobile use"
      ],
      "weaknesses": [
        "1B ceiling shows on noisy or complex layouts",
        "8k context limits multi-page workflows",
        "Tencent Hunyuan License is custom — review before commercial use"
      ],
      "verdict": "The OCR model to pick when every megabyte counts; for messy real-world documents, step up to DeepSeek-OCR."
    }
  },
  {
    "id": "gemma4-26b-moe",
    "name": "Gemma 4 26B-A4B MoE",
    "author": "Google",
    "origin": "us",
    "params": 26,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "general",
      "vision",
      "audio",
      "multilingual",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 16,
      "q5": 19,
      "q8": 28,
      "fp16": 52
    },
    "ramCpu": 28,
    "tokSec": {
      "low": 8,
      "mid": 22,
      "high": 60
    },
    "desc": "Google's MoE variant of Gemma 4 with 26B total / 4B active params and full text+image+audio multimodality. The smallest open model with native audio understanding at this quality.",
    "best": [
      "Multimodal apps that need text, image, and audio in one model",
      "Voice-driven assistants and audio analysis pipelines",
      "Long-context reasoning over mixed-media inputs (128k)",
      "On-prem deployments where Google's tooling integrates cleanly",
      "Replacing three separate models with one"
    ],
    "_en_extras": {
      "strengths": [
        "Unified text, image, and audio in 26B/4B-active MoE",
        "128k context",
        "Strong reasoning relative to size",
        "Backed by Google's training infrastructure and corpus",
        "4B active params keep inference cheap"
      ],
      "weaknesses": [
        "Around 16 GB VRAM in Q4",
        "Gated on Hugging Face with click-through agreement",
        "Gemma license has more restrictions than Apache or MIT"
      ],
      "verdict": "The most capable open multimodal model under 30B if you can live with the Gemma license."
    }
  },
  {
    "id": "dots-llm1",
    "name": "dots.llm1 Instruct",
    "author": "Rednote",
    "origin": "cn",
    "params": 142,
    "family": "dots",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 85,
      "q5": 102,
      "q8": 152,
      "fp16": 284
    },
    "ramCpu": 120,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "Xiaohongshu's first LLM under the Rednote brand — a 142B MoE with 14B active params trained without synthetic data, matching Qwen2.5-72B. Released under MIT.",
    "best": [
      "Creative and lifestyle content generation",
      "Chinese-language social and consumer-facing products",
      "Research on training without synthetic data",
      "MIT-licensed alternative to Qwen for content-heavy use cases",
      "Workloads where natural, non-generic prose matters"
    ],
    "_en_extras": {
      "strengths": [
        "14B active params in a 142B MoE — efficient inference",
        "MIT license",
        "Strong creative and lifestyle content generation",
        "No synthetic data in training — more natural outputs"
      ],
      "weaknesses": [
        "Roughly 85 GB VRAM in Q4 — multi-GPU territory",
        "32k context lags modern flagships",
        "Output style optimized for Chinese social media — may not fit Western tone"
      ],
      "verdict": "An MIT-licensed alternative for creative Chinese content; outside that niche, Qwen3 is the safer pick."
    }
  },
  {
    "id": "qwen3-omni-30b",
    "name": "Qwen 3 Omni 30B-A3B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 30,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "vision",
      "audio",
      "chat",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 62
    },
    "ramCpu": 32,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Alibaba's omni-modal 30B MoE (3B active) with streaming speech, 119-language ASR, and Apache 2.0 licensing. The most accessible truly omnimodal open model.",
    "best": [
      "Voice-first assistants with low-latency speech in/out",
      "Multilingual ASR across 119 languages",
      "Real-time multimodal agents on a single GPU",
      "Long-context multimodal reasoning (131k)",
      "Apache 2.0 commercial deployments"
    ],
    "_en_extras": {
      "strengths": [
        "Native omnimodal I/O: text, image, audio in and out",
        "131k context",
        "Streaming speech for low-latency voice apps",
        "Apache 2.0 license",
        "Only 3B active params per token"
      ],
      "weaknesses": [
        "Around 19 GB VRAM in Q4",
        "Audio path is still maturing relative to text and vision",
        "Tooling support uneven outside vLLM"
      ],
      "verdict": "The default open choice if you actually need audio in and out, not just text and images."
    }
  },
  {
    "id": "qwen35-122b-a10b",
    "name": "Qwen 3.5 122B-A10B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 122,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "multilingual",
      "moe"
    ],
    "ctx": 262000,
    "vram": {
      "q4": 73,
      "q5": 88,
      "q8": 131,
      "fp16": 244
    },
    "ramCpu": 110,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 28
    },
    "desc": "Alibaba's mid-flagship Qwen 3.5 with 122B total / 10B active params and 262k native context. Frontier-class quality that fits on a single H100.",
    "best": [
      "Frontier-quality inference on a single H100",
      "Long-context document and codebase analysis (262k)",
      "Multilingual reasoning workloads",
      "Apache 2.0 deployments where Qwen 397B is overkill",
      "Cost-sensitive agentic systems needing top-tier quality"
    ],
    "_en_extras": {
      "strengths": [
        "Frontier-class quality with only 10B active params",
        "262k native context window",
        "Apache 2.0",
        "Single-H100 deployment is realistic",
        "Strong multilingual coverage"
      ],
      "weaknesses": [
        "Roughly 73 GB VRAM in Q4 — still needs multi-GPU on consumer cards",
        "Mid-flagship positioning means it's eclipsed by 397B on the hardest tasks"
      ],
      "verdict": "The sweet spot of the Qwen 3.5 lineup: H100-friendly with frontier-grade output."
    }
  },
  {
    "id": "pangu-pro-moe-72b",
    "name": "Pangu Pro MoE 72B",
    "author": "Huawei",
    "origin": "cn",
    "params": 72,
    "family": "Pangu",
    "license": "Pangu Model License",
    "tags": [
      "chat",
      "general",
      "moe"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 42,
      "q5": 50,
      "q8": 78,
      "fp16": 144
    },
    "ramCpu": 72,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 28
    },
    "desc": "Huawei's first open-weight release, a 72B MoE optimized for Ascend silicon. Strong on enterprise code and Chinese business scenarios, but the custom Pangu license needs careful review.",
    "best": [
      "Deployments already running on Huawei Ascend hardware",
      "Enterprise code and business workflows in Chinese markets",
      "Research on non-NVIDIA training and inference stacks",
      "Workloads where Huawei's ecosystem integration matters"
    ],
    "_en_extras": {
      "strengths": [
        "First-class optimization for Ascend NPUs",
        "Solid enterprise code and business reasoning",
        "Open weights from a major hyperscaler",
        "MoE design keeps inference tractable"
      ],
      "weaknesses": [
        "Around 42 GB VRAM in Q4",
        "32k context trails modern flagships",
        "Custom Pangu license requires legal review",
        "Tooling outside Huawei's stack is thin"
      ],
      "verdict": "A reasonable pick if you're on Ascend; on NVIDIA hardware, Qwen 3.5 or DeepSeek will serve you better."
    }
  },
  {
    "id": "qwen36-27b",
    "name": "Qwen 3.6 27B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 27,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "code",
      "reasoning",
      "vision",
      "multilingual"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 16,
      "q5": 19,
      "q8": 29,
      "fp16": 54
    },
    "ramCpu": 28,
    "tokSec": {
      "low": 3,
      "mid": 13,
      "high": 32
    },
    "desc": "Dense 27B multimodal model from Alibaba (April 2026), scoring 77.2% on SWE-bench Verified with 262k native context (1M via YaRN). The Qwen 3.6 generation's developer-friendly workhorse.",
    "best": [
      "Coding agents needing top-tier SWE-bench accuracy at single-GPU scale",
      "Multimodal applications with long context (up to 1M with YaRN)",
      "Apache 2.0 deployments replacing closed APIs",
      "Reasoning workloads where dense models behave more predictably than MoE",
      "Local inference on a single 24-32 GB GPU"
    ],
    "_en_extras": {
      "strengths": [
        "77.2% SWE-bench Verified — frontier coding accuracy",
        "Native multimodal text + image",
        "262k context, extendable to 1M with YaRN",
        "Apache 2.0",
        "Dense 27B fits comfortably on consumer hardware"
      ],
      "weaknesses": [
        "Needs 16+ GB VRAM at Q4",
        "Hybrid architecture requires a recent llama.cpp build",
        "Dense design means no MoE inference efficiency"
      ],
      "verdict": "The single-GPU coding model to beat in 2026 — Apache 2.0, multimodal, and frontier-grade on SWE-bench."
    }
  },
  {
    "id": "deepseek-v4-pro",
    "name": "DeepSeek V4 Pro 1.6T",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 1600,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe",
      "multilingual"
    ],
    "ctx": 1000000,
    "vram": {
      "q4": 960,
      "q5": 1150,
      "q8": 1700,
      "fp16": 3200
    },
    "ramCpu": 1100,
    "tokSec": {
      "low": 0.5,
      "mid": 2,
      "high": 8
    },
    "desc": "DeepSeek's frontier MoE: 1.6T total / 49B active params, MIT-licensed, 1M context, with CSA+HCA hybrid attention and three reasoning modes. The absolute open-weight ceiling as of April 2026.",
    "best": [
      "Research labs benchmarking against closed frontier models",
      "Workloads where MIT licensing on frontier quality is the goal",
      "Million-token context tasks (whole codebases, books, archives)",
      "Multi-mode reasoning workflows (Non / High / Max)",
      "Datacenter deployments that can absorb ~1 TB VRAM"
    ],
    "_en_extras": {
      "strengths": [
        "The most capable open-weight model available, period",
        "MIT license at frontier scale",
        "1M context window",
        "Three configurable thinking modes (Non / High / Max)",
        "Hybrid CSA+HCA attention for efficient long-context"
      ],
      "weaknesses": [
        "960+ GB VRAM in Q4 — server farm only",
        "No community quantizations yet at release",
        "Three-mode reasoning adds inference complexity",
        "32T+ token pretraining means very high training carbon footprint"
      ],
      "verdict": "The new open-weight ceiling. If you have the hardware, nothing else comes close."
    }
  },
  {
    "id": "deepseek-v4-flash",
    "name": "DeepSeek V4 Flash 284B",
    "author": "DeepSeek",
    "origin": "cn",
    "params": 284,
    "family": "DeepSeek",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe",
      "multilingual"
    ],
    "ctx": 1000000,
    "vram": {
      "q4": 170,
      "q5": 205,
      "q8": 305,
      "fp16": 568
    },
    "ramCpu": 200,
    "tokSec": {
      "low": 2,
      "mid": 8,
      "high": 22
    },
    "desc": "DeepSeek V4's efficient sibling: 284B MoE with 13B active params, MIT-licensed, 1M context, and the same three-mode reasoning stack. Frontier-adjacent quality at a fraction of the inference cost.",
    "best": [
      "Frontier-class reasoning at single-server scale",
      "Million-token context analysis without datacenter budgets",
      "MIT-licensed alternatives to V4 Pro",
      "Workloads that benefit from having both Base and Instruct variants",
      "Cost-sensitive deployments needing three thinking modes"
    ],
    "_en_extras": {
      "strengths": [
        "MIT license",
        "1M context window",
        "Only 13B active params — fast for its total size",
        "Three thinking modes inherited from V4 Pro",
        "Base and Instruct variants available"
      ],
      "weaknesses": [
        "Around 170 GB VRAM in Q4 — still multi-GPU",
        "Community quantizations were lagging at launch",
        "Quality trails V4 Pro on the hardest reasoning tasks"
      ],
      "verdict": "The efficient way into the V4 family — MIT, 1M context, and inference cost that won't bankrupt you."
    }
  },
  {
    "id": "tencent-hy3-preview",
    "name": "Tencent Hy3 Preview 295B",
    "author": "Tencent",
    "origin": "cn",
    "params": 295,
    "family": "Hunyuan",
    "license": "Tencent Hunyuan License",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "moe"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 177,
      "q5": 210,
      "q8": 315,
      "fp16": 590
    },
    "ramCpu": 200,
    "tokSec": {
      "low": 2,
      "mid": 8,
      "high": 25
    },
    "desc": "Tencent's frontier preview: 295B MoE with 21B active params plus a 3.8B MTP module, 80 layers, top-8 of 192 experts, with fused fast/slow thinking. Released April 2026 under the custom Hunyuan license.",
    "best": [
      "Research on fused fast/slow-thinking architectures",
      "Long-context workloads up to 256k tokens",
      "Base or Instruct fine-tuning at frontier scale",
      "Deployments where Tencent's Hunyuan license is acceptable",
      "Comparing Chinese hyperscaler frontier weights"
    ],
    "_en_extras": {
      "strengths": [
        "Tencent's first frontier-scale open-weight release",
        "256k context window",
        "Both Base and Instruct variants shipped",
        "MTP module accelerates long-form generation",
        "Fused fast/slow thinking in one model"
      ],
      "weaknesses": [
        "Custom Tencent Hunyuan Community License — legal review required",
        "Around 177 GB VRAM in Q4",
        "No Ollama support at launch",
        "Preview status means rough edges in tooling"
      ],
      "verdict": "A serious frontier preview from Tencent, held back from broader adoption by its custom license."
    }
  },
  {
    "id": "llada2-uni",
    "name": "LLaDA 2.0 Uni 16B",
    "author": "Ant Group / inclusionAI",
    "origin": "cn",
    "params": 16,
    "family": "LLaDA",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "vision",
      "general",
      "moe"
    ],
    "ctx": 8192,
    "vram": {
      "q4": 18,
      "q5": 22,
      "q8": 30,
      "fp16": 47
    },
    "ramCpu": 36,
    "tokSec": {
      "low": 25,
      "mid": 60,
      "high": 130
    },
    "desc": "Ant Group's first open Apache 2.0 diffusion LLM: a 16B/1B MoE paired with a 6.2B diffusion decoder, unifying text and vision generation and editing. Released April 2026.",
    "best": [
      "Research on diffusion-based language models",
      "Unified text + image generation and editing in one stack",
      "Interleaved thinking workflows during generation",
      "Apache 2.0 commercial use of dLLM architectures",
      "Experiments comparing diffusion vs. autoregressive decoding"
    ],
    "_en_extras": {
      "strengths": [
        "The first Apache 2.0 open diffusion LLM",
        "Unified text, vision, generation, and editing",
        "Interleaved 'thinking' mode during diffusion",
        "Decoder-turbo distillation runs 8 diffusion steps instead of 50",
        "Apache 2.0 commercial license"
      ],
      "weaknesses": [
        "Diffusion architecture not supported by Ollama or llama.cpp",
        "Requires Flash Attention 2 and CUDA 12.4",
        "Around 47 GB VRAM during active generation",
        "Only 8k context window"
      ],
      "verdict": "A research-first release that proves Apache 2.0 dLLMs are real — production users should wait for tooling to catch up."
    }
  },
  {
    "id": "mimo-v25-pro",
    "name": "MiMo V2.5 Pro",
    "author": "Xiaomi",
    "origin": "cn",
    "params": 1020,
    "family": "MiMo",
    "license": "MIT",
    "tags": [
      "chat",
      "reasoning",
      "code",
      "moe",
      "multilingual"
    ],
    "ctx": 1000000,
    "vram": {
      "q4": 595,
      "q5": 720,
      "q8": 1090,
      "fp16": 2040
    },
    "ramCpu": 700,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 12
    },
    "desc": "Xiaomi's MIT-licensed frontier agentic model: 1.02T MoE with 42B active params, 57.2% on SWE-Bench Pro, 1M context, and 6:1 hybrid attention. Released April 2026.",
    "best": [
      "Frontier autonomous coding agents at MIT licensing",
      "Workflows chaining 1,000+ tool calls in a single session",
      "Million-token codebase reasoning",
      "Research on multi-teacher distillation outcomes",
      "Datacenter deployments seeking the open agentic ceiling"
    ],
    "_en_extras": {
      "strengths": [
        "MIT license at frontier agentic scale",
        "1M context window",
        "Supports 1,000+ tool calls per chain",
        "57.2% on SWE-Bench Pro",
        "Hybrid 6:1 attention cuts KV-cache by 7x vs. full attention"
      ],
      "weaknesses": [
        "Roughly 600 GB VRAM in Q4 — datacenter only",
        "No official Ollama quantization",
        "MTP support is uneven across inference engines"
      ],
      "verdict": "The open agentic frontier — MIT, million-token, thousand-call — if you have the silicon to run it."
    }
  },
  {
    "id": "mimo-v25",
    "name": "MiMo V2.5",
    "author": "Xiaomi",
    "origin": "cn",
    "params": 310,
    "family": "MiMo",
    "license": "MIT",
    "tags": [
      "chat",
      "vision",
      "audio",
      "moe",
      "multilingual"
    ],
    "ctx": 1000000,
    "vram": {
      "q4": 180,
      "q5": 220,
      "q8": 330,
      "fp16": 620
    },
    "ramCpu": 230,
    "tokSec": {
      "low": 1,
      "mid": 5,
      "high": 15
    },
    "desc": "Xiaomi's MIT-licensed omnimodal model: 310B MoE with 15B active params handling text, image, video, and audio. Scores 87.7 on Video-MME with 1M context. Released April 2026.",
    "best": [
      "Video understanding pipelines (Video-MME 87.7)",
      "Unified text, image, video, and audio workflows",
      "Million-token multimodal context tasks",
      "MIT-licensed alternative to closed omnimodal APIs",
      "Document and chart reasoning (CharXiv RQ 81.0)"
    ],
    "_en_extras": {
      "strengths": [
        "Omnimodal under MIT — text, image, video, audio",
        "1M context window",
        "87.7 Video-MME and 81.0 CharXiv RQ",
        "Permissive MIT license at frontier scale",
        "MoE design keeps active compute reasonable"
      ],
      "weaknesses": [
        "Around 180 GB VRAM in Q4",
        "Video and audio inference pipelines are not yet standardized",
        "No Ollama support"
      ],
      "verdict": "The first MIT-licensed model that genuinely handles video alongside everything else."
    }
  },
  {
    "id": "granite41-8b",
    "name": "Granite 4.1 8B Instruct",
    "author": "IBM",
    "origin": "us",
    "params": 8,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "code",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 5,
      "q5": 6,
      "q8": 9,
      "fp16": 16
    },
    "ramCpu": 10,
    "tokSec": {
      "low": 12,
      "mid": 35,
      "high": 90
    },
    "desc": "IBM's dense 8B Granite 4.1 release: Apache 2.0, 12 languages, 131k context, MMLU 73.84, HumanEval 85.37. Trained on a CoreWeave GB200 NVL72 cluster.",
    "best": [
      "Enterprise deployments needing Apache 2.0 and IBM provenance",
      "Tool-calling agents with predictable behavior at moderate scale",
      "Multilingual products across 12 languages including French",
      "Long-context tasks up to 131k tokens",
      "Coding workloads on a single mid-range GPU"
    ],
    "_en_extras": {
      "strengths": [
        "Apache 2.0 with full transparency on training",
        "Strong tool calling and instruction following",
        "12 native languages including French",
        "131k context window",
        "Excellent quality-per-parameter at the 8B tier"
      ],
      "weaknesses": [
        "No official Ollama tag at release",
        "Reasoning in non-English languages still trails English",
        "No MoE variant at this size"
      ],
      "verdict": "IBM's most usable open model yet — Apache 2.0, multilingual, and well-suited for enterprise tool use."
    }
  },
  {
    "id": "nemotron-omni-30b",
    "name": "Nemotron 3 Nano Omni 30B-A3B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 30,
    "family": "Nemotron",
    "license": "NVIDIA Open Model License",
    "tags": [
      "chat",
      "vision",
      "audio",
      "reasoning",
      "moe"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 21,
      "q5": 25,
      "q8": 33,
      "fp16": 62
    },
    "ramCpu": 36,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "NVIDIA's omnimodal MoE: 30B total / 3B active, handling text, image, audio, and video in 256k context. Hybrid Mamba2-MoE architecture delivers 9x the throughput of competing open omni models. Released April 2026.",
    "best": [
      "High-throughput omnimodal inference on NVIDIA hardware",
      "Single-GPU deployments needing text + image + audio + video",
      "Long-context multimodal analysis (256k)",
      "Production pipelines built on NVIDIA NIM",
      "English-only voice and video assistants"
    ],
    "_en_extras": {
      "strengths": [
        "Native omnimodal: text, image, audio, video",
        "256k context window",
        "9x throughput versus other open omni models",
        "Runs on a single GPU thanks to 3B active MoE",
        "First-class NVIDIA NIM pipeline"
      ],
      "weaknesses": [
        "English-only",
        "Full multimodal requires llama.cpp or vLLM (Ollama is text-only)",
        "NVIDIA Open Model License is not Apache or MIT"
      ],
      "verdict": "The fastest open omnimodal model on a single GPU — as long as you only need English."
    }
  },
  {
    "id": "laguna-xs2",
    "name": "Laguna XS.2",
    "author": "Poolside",
    "origin": "us",
    "params": 33,
    "family": "Laguna",
    "license": "Apache 2.0",
    "tags": [
      "code",
      "moe"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 66
    },
    "ramCpu": 36,
    "tokSec": {
      "low": 15,
      "mid": 40,
      "high": 100
    },
    "desc": "Poolside's first open-weight release: a 33B MoE (3B active) under Apache 2.0 built specifically for agentic coding. Scores 68.2% on SWE-Bench Verified and runs on a 36 GB Mac.",
    "best": [
      "Local coding agents on developer laptops (Mac 36 GB or similar)",
      "Apache 2.0 commercial coding assistants",
      "Agentic workflows needing native tool calls and streaming",
      "Frontier-grade SWE-Bench scores without datacenter hardware",
      "Replacing closed coding APIs with a self-hosted alternative"
    ],
    "_en_extras": {
      "strengths": [
        "68.2% SWE-Bench Verified — top-tier among open models",
        "Runs on a 36 GB Mac",
        "Apache 2.0 with no commercial restrictions",
        "Native tool calls and streaming",
        "Official Ollama tag with multiple quantizations"
      ],
      "weaknesses": [
        "Coding-specialized — not a general chat model",
        "MoE + SWA architecture needs transformers v5.6.2 or newer",
        "Interleaved thinking can slow first-token latency"
      ],
      "verdict": "The strongest open coding model that actually fits on a developer laptop — Apache 2.0 to boot."
    }
  },
  {
    "id": "granite41-30b",
    "name": "Granite 4.1 30B Instruct",
    "author": "IBM",
    "origin": "us",
    "params": 30,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "code",
      "multilingual"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 17,
      "q5": 21,
      "q8": 32,
      "fp16": 60
    },
    "ramCpu": 36,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "IBM's dense 30B Granite 4.1: Apache 2.0, 12 languages, 131k context, with OpenAI-compatible tool calling. Built on the same GB200 NVL72 cluster as the rest of the 4.1 lineup.",
    "best": [
      "Enterprise agents requiring OpenAI-compatible function calling",
      "Apache 2.0 deployments where Granite 8B isn't enough",
      "Multilingual products across 12 languages including French",
      "Long-context workflows up to 131k",
      "Single-GPU production on RTX 5090 or A100 class hardware"
    ],
    "_en_extras": {
      "strengths": [
        "Apache 2.0 with IBM-grade transparency",
        "Native OpenAI function-calling schema",
        "12 languages including French",
        "131k context window",
        "Official Ollama tag with multiple quantizations"
      ],
      "weaknesses": [
        "Needs ~32 GB VRAM at Q4 — RTX 5090 territory",
        "No MoE variant at this size",
        "Non-English reasoning trails English"
      ],
      "verdict": "The Granite to pick when 8B feels light: Apache 2.0, function-calling native, and built for enterprise."
    }
  },
  {
    "id": "granite41-3b",
    "name": "Granite 4.1 3B Instruct",
    "author": "IBM",
    "origin": "us",
    "params": 3,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "code",
      "multilingual",
      "small"
    ],
    "ctx": 131072,
    "vram": {
      "q4": 2,
      "q5": 2.5,
      "q8": 3,
      "fp16": 6
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 160
    },
    "desc": "IBM's dense 3B Granite 4.1: Apache 2.0, 12 languages, 131k context, with tool calling and FIM code support. The smallest Granite tier, sharing data and pipeline with its larger siblings.",
    "best": [
      "Edge and embedded deployments needing ~3 GB VRAM",
      "Code completion with fill-in-the-middle",
      "Tool-calling agents on resource-constrained hardware",
      "Multilingual apps across 12 languages",
      "Long-context tasks at very small scale (131k)"
    ],
    "_en_extras": {
      "strengths": [
        "Apache 2.0 with full openness",
        "Tool calling plus FIM code completion",
        "12 languages including French",
        "131k context at only 3B params",
        "Fits in 3 GB VRAM"
      ],
      "weaknesses": [
        "Reasoning lags the 8B and 30B siblings",
        "Demanding chat use cases really want the 8B model"
      ],
      "verdict": "A serious 3B option for edge and embedded — same Granite recipe, just smaller."
    }
  },
  {
    "id": "ling-26-1t",
    "name": "Ling 2.6 1T",
    "author": "Ant Group / inclusionAI",
    "origin": "cn",
    "params": 1000,
    "family": "Ling",
    "license": "MIT",
    "tags": [
      "chat",
      "general",
      "moe",
      "multilingual"
    ],
    "ctx": 262144,
    "vram": {
      "q4": 580,
      "q5": 710,
      "q8": 1070,
      "fp16": 2000
    },
    "ramCpu": 700,
    "tokSec": {
      "low": 1,
      "mid": 4,
      "high": 12
    },
    "desc": "Ant Group's Ling 2.6 1T: MIT-licensed MoE with 50B active params, hybrid MLA + Linear Attention, and 256k context. Top open non-reasoning model with an Intelligence Index of 34.",
    "best": [
      "Agentic workloads needing mature tool calling at frontier scale",
      "Long-context analysis up to 256k tokens",
      "MIT-licensed datacenter deployments",
      "Non-reasoning workloads where speed beats deliberation",
      "Replacing closed flagships with open weights"
    ],
    "_en_extras": {
      "strengths": [
        "Permissive MIT license",
        "Top open non-reasoning Intelligence Index (34)",
        "256k context window",
        "Efficient hybrid MLA + Linear Attention",
        "Mature agentic tool calling, compatible with Qwen2.5 parsers"
      ],
      "weaknesses": [
        "Around 600 GB VRAM in Q4 — datacenter required",
        "Hugging Face weights only — no Ollama tag",
        "Not a reasoning model; pick DeepSeek V4 for deliberation"
      ],
      "verdict": "The MIT-licensed flagship to beat for non-reasoning, agentic workloads at trillion-parameter scale."
    }
  },
  {
    "id": "gemma4-e2b",
    "name": "Gemma 4 E2B",
    "author": "Google",
    "origin": "us",
    "params": 2,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "vision",
      "small",
      "multilingual",
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 7,
      "q5": 9,
      "q8": 13,
      "fp16": 25
    },
    "ramCpu": 9,
    "tokSec": {
      "low": 20,
      "mid": 55,
      "high": 130
    },
    "desc": "Google's edge-optimized Gemma 4: 2B effective params, full text + image multimodal, 128k context, and a configurable thinking mode. Built for laptops, mobile, and CPU inference.",
    "best": [
      "On-device multimodal apps on laptops and phones",
      "CPU or low-end GPU inference at ~7 GB Q4",
      "Long-context tasks up to 128k at edge scale",
      "Quick-toggle thinking mode for harder prompts",
      "140+ language coverage in a tiny footprint"
    ],
    "_en_extras": {
      "strengths": [
        "Full multimodal in ~7 GB at Q4",
        "Runs on CPU or entry-level GPU",
        "128k context",
        "Thinking mode toggle",
        "Open Gemma license"
      ],
      "weaknesses": [
        "Quality trails the E4B and 26B variants",
        "Reasoning benchmarks well below larger models",
        "Gemma license isn't Apache or MIT"
      ],
      "verdict": "The Gemma 4 to pick when you're shipping on-device — small, multimodal, and surprisingly long-context."
    }
  },
  {
    "id": "nemotron-cascade-2",
    "name": "Nemotron Cascade 2 30B-A3B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 30,
    "family": "Nemotron",
    "license": "NVIDIA Open Model License",
    "tags": [
      "chat",
      "code",
      "reasoning",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 17,
      "q5": 21,
      "q8": 32,
      "fp16": 60
    },
    "ramCpu": 39,
    "tokSec": {
      "low": 8,
      "mid": 30,
      "high": 80
    },
    "desc": "NVIDIA's 30B MoE (3B active) with both thinking and instruct modes. Earned IMO 2025 and IOI 2025 gold medals — 30B-class reasoning at 3B-active inference speed. Released April 2026.",
    "best": [
      "Competition-grade math and code workloads",
      "Reasoning agents needing fast inference (3B active)",
      "Single-GPU deployments on 24 GB cards in Q4",
      "Production systems on NVIDIA Open Model License terms",
      "Tasks switching between thinking and instruct modes"
    ],
    "_en_extras": {
      "strengths": [
        "Gold medal at IMO 2025 and IOI 2025 in thinking mode",
        "Fast inference with only 3B active params",
        "Fits on a 24 GB GPU at Q4",
        "Commercial use allowed under NVIDIA Open Model License"
      ],
      "weaknesses": [
        "NVIDIA Open Model License — not Apache or MIT",
        "32+ GB VRAM total in Q4 (full model is 30B)",
        "Thinking mode generation can be slow"
      ],
      "verdict": "Olympic-grade reasoning at 3B-active inference cost — the sharpest open math and code model in its weight class."
    }
  },
  {
    "id": "nemotron3",
    "name": "Nemotron 3 33B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 33,
    "family": "Nemotron",
    "license": "NVIDIA Open Model License",
    "tags": [
      "chat",
      "code",
      "reasoning"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 19,
      "q5": 23,
      "q8": 35,
      "fp16": 66
    },
    "ramCpu": 43,
    "tokSec": {
      "low": 3,
      "mid": 12,
      "high": 30
    },
    "desc": "NVIDIA's dense 33B model targeting balanced chat, code, and reasoning workloads. Fits a single RTX 4090 at Q4 with a 128k context window.",
    "best": [
      "Single-GPU local deployment on a 24GB card (RTX 4090/3090) at Q4",
      "Mixed workloads spanning chat, code generation, and step-by-step reasoning",
      "Long-document analysis up to 128k tokens",
      "Self-hosted alternative to mid-tier API models when data must stay on-prem"
    ],
    "_en_extras": {
      "strengths": [
        "Dense 33B sized to saturate a 24GB consumer GPU at Q4",
        "128k context handles long codebases and reports",
        "RLHF tuned for reasoning and code, not just chat",
        "Open weights backed by NVIDIA's research stack"
      ],
      "weaknesses": [
        "NVIDIA Open Model License has commercial terms worth reviewing carefully",
        "Gated on Hugging Face (click-through access required)",
        "Dense 33B is heavier than comparable MoE alternatives at inference"
      ],
      "verdict": "A solid single-GPU workhorse for teams that want strong reasoning and code on a 4090 without depending on an API."
    }
  },
  {
    "id": "nemotron-3-nano",
    "name": "Nemotron 3 Nano 30B-A3B",
    "author": "NVIDIA",
    "origin": "us",
    "params": 30,
    "family": "Nemotron",
    "license": "NVIDIA Open Model License",
    "tags": [
      "chat",
      "code",
      "reasoning",
      "moe"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 17,
      "q5": 21,
      "q8": 32,
      "fp16": 60
    },
    "ramCpu": 39,
    "tokSec": {
      "low": 8,
      "mid": 30,
      "high": 80
    },
    "desc": "NVIDIA's 30B-parameter MoE with only 3.5B active per token, delivering 30B-class quality at small-model speeds across chat, code, and reasoning. 128k context.",
    "best": [
      "Throughput-sensitive serving where latency matters more than peak quality",
      "Local inference with partial CPU offload (around 39GB system RAM)",
      "Long-context reasoning and coding without paying dense-model compute",
      "Workloads that previously needed a dense 30B but were too slow"
    ],
    "_en_extras": {
      "strengths": [
        "MoE routing yields 3.5B-class latency with 30B-class capability",
        "128k context for large documents and repos",
        "Strong across chat, code, and reasoning in one checkpoint",
        "Distillation plus RL alignment from the broader Nemotron family"
      ],
      "weaknesses": [
        "Needs ~39GB system RAM when partially offloaded to CPU",
        "NVIDIA Open Model License — review commercial terms",
        "Gated on Hugging Face"
      ],
      "verdict": "The fast lane of the Nemotron 3 family — pick it when you want 30B output quality but can't afford 30B latency."
    }
  },
  {
    "id": "medgemma",
    "name": "MedGemma 4B",
    "author": "Google",
    "origin": "us",
    "params": 4,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "vision",
      "multilingual",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 2.3,
      "q5": 2.8,
      "q8": 4.3,
      "fp16": 8
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 150
    },
    "desc": "Google's 4B medical variant of Gemma with vision and text, tuned for radiology, clinical imaging, and report drafting. 128k context, Gemma license.",
    "best": [
      "Drafting or summarizing radiology and clinical reports",
      "Prototyping medical imaging assistants on a single small GPU",
      "Research into multimodal clinical NLP without API costs",
      "Edge deployments in healthcare workflows where data can't leave the device"
    ],
    "_en_extras": {
      "strengths": [
        "Domain-tuned on clinical literature and radiology imagery",
        "Compact 4B footprint (~2.3GB VRAM at Q4)",
        "True multimodal — text plus medical images",
        "Permissive Gemma license for research and most commercial use"
      ],
      "weaknesses": [
        "Decision-support only — not approved for direct clinical use",
        "Narrow specialization; weak outside medical contexts",
        "Gated on Hugging Face"
      ],
      "verdict": "A pocket-sized clinical assistant for research and report drafting — never a substitute for a licensed clinician."
    }
  },
  {
    "id": "gemma4",
    "name": "Gemma 4 2B",
    "author": "Google",
    "origin": "us",
    "params": 2,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "vision",
      "multilingual",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 1.2,
      "q5": 1.4,
      "q8": 2.1,
      "fp16": 4
    },
    "ramCpu": 2.6,
    "tokSec": {
      "low": 35,
      "mid": 100,
      "high": 200
    },
    "desc": "Google's 2B base model in the Gemma 4 family with text and image input, 128k context, and a 1.2GB Q4 footprint that runs on integrated graphics or a Raspberry Pi 5.",
    "best": [
      "On-device assistants for laptops, phones, and SBCs",
      "Multimodal prototypes that can't justify a dedicated GPU",
      "Long-context summarization at the edge",
      "Air-gapped or offline scenarios where latency and privacy matter"
    ],
    "_en_extras": {
      "strengths": [
        "Runs on integrated GPUs at ~1.2GB VRAM in Q4",
        "Multimodal text and image input out of the box",
        "128k context unusual at this parameter count",
        "Permissive Gemma license"
      ],
      "weaknesses": [
        "Reasoning lags behind 4B and larger Gemma variants",
        "Gated on Hugging Face (click-through access)"
      ],
      "verdict": "The smallest Gemma 4 that still feels useful — a strong default for edge multimodal apps."
    }
  },
  {
    "id": "qwen3-5",
    "name": "Qwen 3.5 0.8B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 0.8,
    "family": "Qwen",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "general",
      "small",
      "multilingual"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 0.5,
      "q5": 0.6,
      "q8": 0.9,
      "fp16": 1.6
    },
    "ramCpu": 2,
    "tokSec": {
      "low": 110,
      "mid": 170,
      "high": 220
    },
    "desc": "Alibaba's ultra-compact 0.8B chat model with a 256k context window and a sub-1GB Q4 footprint, Apache 2.0 on Ollama. Runs on CPUs, integrated GPUs, and Raspberry Pi.",
    "best": [
      "Embedded assistants on phones, SBCs, and microcontrollers with NPUs",
      "Cheap classification, routing, or instruction-following at scale",
      "Offline chat where memory and power budgets are tight",
      "Long-context retrieval scenarios that don't need deep reasoning"
    ],
    "_en_extras": {
      "strengths": [
        "Negligible memory footprint — under 1GB at Q4",
        "256k context, rare at this size",
        "Apache 2.0 distribution via Ollama",
        "Runs comfortably on CPU, integrated GPU, or Raspberry Pi"
      ],
      "weaknesses": [
        "Reasoning quality is inherently limited at 0.8B",
        "Text-only — no vision capability",
        "Hugging Face distribution uses the Qwen license rather than Apache"
      ],
      "verdict": "The right pick when you need a real LLM in under a gigabyte and don't need it to think hard."
    }
  },
  {
    "id": "medgemma1-5",
    "name": "MedGemma 1.5 4B",
    "author": "Google",
    "origin": "us",
    "params": 4,
    "family": "Gemma",
    "license": "Gemma",
    "tags": [
      "chat",
      "vision",
      "multilingual",
      "small"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 2.3,
      "q5": 2.8,
      "q8": 4.3,
      "fp16": 8
    },
    "ramCpu": 5,
    "tokSec": {
      "low": 25,
      "mid": 70,
      "high": 150
    },
    "desc": "Google's v1.5 update to MedGemma — a 4B vision-and-text model fine-tuned on clinical literature, radiology imagery, and medical reports. 128k context, Gemma license.",
    "best": [
      "Upgrading existing MedGemma 1.0 deployments without re-architecting",
      "Drafting and summarizing clinical reports with image grounding",
      "Research workflows in radiology and medical imaging",
      "On-prem clinical assistants where API calls aren't an option"
    ],
    "_en_extras": {
      "strengths": [
        "Iterative refinement over MedGemma 1.0 with the same footprint",
        "Compact 4B (~2.3GB VRAM at Q4)",
        "Multimodal — text plus medical imagery",
        "128k context for long patient histories and literature"
      ],
      "weaknesses": [
        "Decision-support tool only — not for direct clinical use",
        "Narrow medical focus, weak general performance",
        "Gated on Hugging Face"
      ],
      "verdict": "A drop-in upgrade to MedGemma 1.0 with sharper clinical performance at the same compact size."
    }
  },
  {
    "id": "granite4-1",
    "name": "Granite 4.1",
    "author": "IBM",
    "origin": "us",
    "params": 3,
    "family": "Granite",
    "license": "Apache 2.0",
    "tags": [
      "chat",
      "code"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 1.7,
      "q5": 2.1,
      "q8": 3.2,
      "fp16": 6
    },
    "ramCpu": 3.9,
    "tokSec": {
      "low": 50,
      "mid": 85,
      "high": 130
    },
    "desc": "IBM's Granite 4.1 in its generic 3B Ollama tag — Apache 2.0, 128k context, robust tool calling, and a sub-2GB Q4 footprint. Code- and chat-oriented.",
    "best": [
      "Agent and tool-use pipelines where license clarity matters",
      "Enterprise deployments that require fully permissive Apache 2.0 weights",
      "Multilingual chat across the 12 languages IBM trained on",
      "Lightweight coding assistants on developer machines"
    ],
    "_en_extras": {
      "strengths": [
        "Fully Apache 2.0 — no click-through, no commercial caveats",
        "128k context window",
        "OpenAI-compatible tool calling that works reliably",
        "Compact ~1.7GB VRAM at Q4"
      ],
      "weaknesses": [
        "Hugging Face distribution is gated even though the license is open",
        "Generic Ollama tag doesn't pin a specific size variant"
      ],
      "verdict": "The pragmatic Apache 2.0 default for agentic workflows when license friction is a non-starter."
    }
  },
  {
    "id": "qwen3-6",
    "name": "Qwen 3.6 27B",
    "author": "Alibaba",
    "origin": "cn",
    "params": 27,
    "family": "Qwen",
    "license": "Qwen License",
    "tags": [
      "chat",
      "code",
      "reasoning",
      "vision",
      "multilingual"
    ],
    "ctx": 256000,
    "vram": {
      "q4": 16,
      "q5": 19,
      "q8": 29,
      "fp16": 54
    },
    "ramCpu": 35,
    "tokSec": {
      "low": 9,
      "mid": 14,
      "high": 22
    },
    "desc": "Alibaba's Qwen 3.6 27B — multimodal vision and text with a native 256k context, tuned for multilingual reasoning and code. Fits a 16GB GPU at Q4.",
    "best": [
      "Multilingual code generation across mixed-language codebases",
      "Long-context document analysis up to 256k tokens",
      "Vision-grounded reasoning over screenshots, diagrams, and PDFs",
      "Self-hosted alternative to commercial multimodal APIs"
    ],
    "_en_extras": {
      "strengths": [
        "Native 256k context handles entire repos and long PDFs",
        "Genuinely multimodal — vision plus text",
        "Strong multilingual code performance",
        "Reasoning sharpened over earlier Qwen generations"
      ],
      "weaknesses": [
        "Qwen License — not strictly Apache, review terms",
        "Needs ~16GB VRAM at Q4",
        "Gated on Hugging Face"
      ],
      "verdict": "Qwen's most capable mid-size open model — a strong multimodal pick for a single 16GB+ GPU."
    }
  },
  {
    "id": "lfm2-5-thinking",
    "name": "LFM2.5 Thinking 1.2B",
    "author": "Liquid AI",
    "origin": "us",
    "params": 1.2,
    "family": "LFM",
    "license": "LFM Open License v1.0",
    "tags": [
      "chat",
      "general",
      "reasoning",
      "small"
    ],
    "ctx": 32768,
    "vram": {
      "q4": 0.7,
      "q5": 0.9,
      "q8": 1.3,
      "fp16": 2.4
    },
    "ramCpu": 1.6,
    "tokSec": {
      "low": 110,
      "mid": 170,
      "high": 220
    },
    "desc": "Liquid AI's 1.2B reasoning variant with an explicit thinking mode, sub-1GB Q4 footprint, and CPU/iGPU-friendly inference. 32k context.",
    "best": [
      "On-device reasoning on laptops and SBCs without a discrete GPU",
      "Latency-sensitive tasks that still benefit from chain-of-thought",
      "Edge agents where memory budget rules out larger models",
      "Privacy-first deployments that must stay fully local"
    ],
    "_en_extras": {
      "strengths": [
        "Negligible memory footprint — under 1GB at Q4",
        "Runs comfortably on CPU and integrated GPUs",
        "Explicit thinking mode for visible chain-of-thought",
        "Low-latency inference suitable for interactive use"
      ],
      "weaknesses": [
        "1.2B parameters cap absolute capability",
        "32k context is short by 2026 standards",
        "LFM Open License rather than pure Apache"
      ],
      "verdict": "The most capable sub-2B reasoning model that still fits comfortably on a CPU-only laptop."
    }
  },
  {
    "id": "glm-4-7-flash",
    "name": "GLM 4.7 Flash",
    "author": "Zhipu AI",
    "origin": "cn",
    "params": 3,
    "family": "GLM",
    "license": "MIT",
    "tags": [
      "chat",
      "multilingual"
    ],
    "ctx": 128000,
    "vram": {
      "q4": 1.7,
      "q5": 2.1,
      "q8": 3.2,
      "fp16": 6
    },
    "ramCpu": 3.9,
    "tokSec": {
      "low": 50,
      "mid": 85,
      "high": 130
    },
    "desc": "Zhipu AI's compact 3B variant of GLM 4.7, MIT-licensed with a 128k context. Optimized for low-latency bilingual Chinese-English chat.",
    "best": [
      "Bilingual zh/en chat assistants where latency is critical",
      "Lightweight chat backends with a strict permissive license requirement",
      "Long-context summarization on small GPUs",
      "Cost-sensitive serving at scale where 30B variants are overkill"
    ],
    "_en_extras": {
      "strengths": [
        "MIT license — among the most permissive in the open ecosystem",
        "128k context in a 3B footprint",
        "Strong Chinese and English performance",
        "Compact ~1.7GB VRAM at Q4"
      ],
      "weaknesses": [
        "Gated on Hugging Face despite the open license",
        "Less versatile than the 30B GLM 4.7 variants"
      ],
      "verdict": "MIT-licensed, fast, and bilingual — the GLM 4.7 to reach for when you need throughput over peak capability."
    }
  }
]