format | 格式化 | Python dict、列表 |
模型后端
# The assignments below are alternatives: each one rebinds `model`, so keep
# only the line for the backend you actually use.
# Transformers (local)
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
# llama.cpp
model = outlines.models.llamacpp("path/to/model.gguf")
# vLLM
model = outlines.models.vllm("meta-llama/Llama-3-8B-Instruct")
# OpenAI
model = outlines.models.openai("gpt-4")
词汇引导(Token 掩码)
Outlines 在生成时就限制可采样的 token 范围(而不是生成后再校验),从而实现零开销的结构化生成。
- 不浪费样本在无效 token 上
- 保证输出格式正确
- 比 ReAct/自研解析器更快
进阶用法
带验证的 Pydantic
from pydantic import BaseModel, field_validator
import outlines


# Target schema for JSON generation: the generator below returns a `User`.
class User(BaseModel):
    name: str
    age: int
    email: str

    # Pydantic runs this validator whenever a `User` is validated; it rejects
    # zero and negative ages with a ValueError.
    @field_validator('age')
    @classmethod
    def age_must_be_positive(cls, v):
        if v <= 0:
            raise ValueError('Age must be positive')
        return v


model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
# Constrain decoding to JSON that matches the `User` schema.
generator = outlines.generate.json(model, User)
result = generator("Create a user named Bob, age 25, email bob@test.com")
列表生成
import json

import outlines

model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")

# Comma-separated continuation of the list started in the prompt.
# `generate.format` only accepts scalar Python types (int, float, bool,
# date, ...), so the "word, word" shape is expressed as a regex instead.
list_gen = outlines.generate.regex(model, r"[a-z]+, [a-z]+")
items = list_gen("List three fruits: apple, ")
print(items)  # e.g. "banana, orange"

# JSON array of strings. `generate.json` takes a Pydantic model, a callable
# or a JSON-schema *string*, so the schema dict is serialized first.
array_schema = json.dumps({"type": "array", "items": {"type": "string"}})
generator = outlines.generate.json(model, array_schema)
result = generator("List three colors")
函数调用
from pydantic import BaseModel
import outlines


# Fields to pull out of free text; `organization` and `title` may be absent
# from the input, so they are nullable and default to None.
class ExtractInfo(BaseModel):
    name: str
    organization: str | None = None
    title: str | None = None


model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
# Constrain decoding to JSON that matches the `ExtractInfo` schema.
generator = outlines.generate.json(model, ExtractInfo)

text = "John Smith is the CEO of Acme Corp."
result = generator(f"Extract: {text}")
# {"name": "John Smith", "organization": "Acme Corp.", "title": "CEO"}
XML 生成
# XML-shaped output. `generate.format` takes a Python type, not a template
# string, so the element structure is expressed as a regular expression and
# enforced with `generate.regex`.
# NOTE(review): the original template lost its tags; the <user>/<name>/<age>/
# <email> element names are reconstructed from the placeholder names - confirm.
xml_schema = (
    r"<user>"
    r"<name>[A-Za-z ]+</name>"
    r"<age>[0-9]{1,3}</age>"
    r"<email>[a-z0-9.]+@[a-z0-9.]+</email>"
    r"</user>"
)
generator = outlines.generate.regex(model, xml_schema)
result = generator("Generate user data in XML format.")
常见工作流
工作流 1:RAG 提取
from pydantic import BaseModel
import outlines


# Metadata extracted from a passage. `author` and `date` accept None but have
# no default, so they must still be present in the generated JSON.
class DocumentMetadata(BaseModel):
    title: str
    author: str | None
    date: str | None
    summary: str


model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
# Constrain decoding to JSON that matches the `DocumentMetadata` schema.
generator = outlines.generate.json(model, DocumentMetadata)

text = "The paper 'Attention Is All You Need' by Vaswani et al. (2017) introduced the Transformer architecture."
result = generator(f"Extract metadata from: {text}")
# {"title": "Attention Is All You Need", "author": "Vaswani et al.", "date": "2017", "summary": "..."}
工作流 2:批量分类
from typing import Literal, get_args

import outlines

# Closed set of labels. The generator's choices are derived from this alias so
# the type and the decoding constraint cannot drift apart.
Sentiment = Literal["positive", "negative", "neutral"]

model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
# Build the generator once and reuse it for every review.
generator = outlines.generate.choice(model, list(get_args(Sentiment)))

reviews = [
    "这个产品非常好用!",
    "太差了,完全不好用。",
    "一般般,中规中矩。",
]

for review in reviews:
    sentiment = generator(f"情感分析: {review}")
    print(f"'{review}' -> {sentiment}")
工作流 3:代码生成
import outlines

model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")

# Constrain the output to a one-line `greet` function. `generate.format` takes
# a Python type rather than a template, so the fixed signature plus a free
# single-line return expression is written as a regex. The parentheses are
# escaped because they are regex metacharacters; the newlines are literal.
py_schema = "def greet\\(name: str, age: int\\) -> str:\n    return [^\n]+"
generator = outlines.generate.regex(model, py_schema)

code = generator("Write a greet function.")
print(code)
# def greet(name: str, age: int) -> str:
#     return f"Hello, {name}! You are {age} years old."
配置参数
采样参数
import outlines

model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")

# Custom sampling: multinomial sampling restricted to the 50 most likely
# tokens. `outlines.samplers` exposes `multinomial`, `greedy` and
# `beam_search`; top-k is an option of `multinomial`, not a separate sampler.
result = outlines.generate.choice(
    model,
    ["positive", "negative", "neutral"],
    sampler=outlines.samplers.multinomial(top_k=50),
)("This movie is amazing!")

# Greedy sampling: always pick the most likely allowed token.
result = outlines.generate.choice(
    model,
    ["positive", "negative", "neutral"],
    sampler=outlines.samplers.greedy(),
)("This movie is amazing!")
常见问题
| 问题 | 解决方案 |
| --- | --- |
| 生成进入死循环 | 确保 Schema/Regex 完整,无歧义分支 |
| 输出不完全的 JSON | 设置 max_tokens 确保完整生成 |
| 不支持的模型 | 检查 outlines 版本,使用 transformers 后端 |
| Pydantic 验证失败 | 添加 field_validator 或使用 nullable 字段 |
资源链接
- GitHub: https://github.com/dottxt-ai/outlines
- 文档: https://dottxt-ai.github.io/outlines/
- Pydantic 集成: https://dottxt-ai.github.io/outlines/reference/pydantic/
- 正则语法: https://dottxt-ai.github.io/outlines/reference/regex/