This document proposes an innovative framework for system evaluation, "Evaluation as a Goal Surface," which is part of the Structured Intelligence (SI) core system. The framework turns evaluation from a traditional after-the-fact analysis into a first-class citizen of system design: by explicitly defining evaluation goal surfaces, experiment designs, and execution constraints, it makes system evaluation safer and more transparent.
The evaluation goal surface is the core concept of the framework. It explicitly defines the subject under evaluation, its scope, its objectives, and its constraints:
```python
eval_surface = {
    "id": "eval:learning_exercise_selection/v1",
    "subject": "jump:learning.pick_next_exercise",
    "scope": {
        "domain": "learning",
        "population": "grade_5_reading_difficulties",
        "context": "school_hours"
    },
    "objectives": {
        # Weights are in basis points (10000 bp = 100%).
        "primary": [
            {"name": "mastery_gain_7d_bp", "weight_bp": 6000},
            {"name": "wellbeing_score_bp", "weight_bp": 4000}
        ],
        "secondary": [
            {"name": "ops_cost_per_session_usd_micros", "weight_bp": -1000}
        ]
    },
    "constraints": {
        "hard": [
            "wellbeing_score_bp >= 7000",
            "no_increase_in_flagged_distress_events == true"
        ]
    }
}
```
The learning boundary (PoLB) is a key safety mechanism of the system design; it defines the envelope within which an experiment may run:
```python
polb_config = {
    "envelope_mode": "online",  # sandbox | shadow | online
    "mode_name": "ONLINE_EXPERIMENTAL_STRATIFIED",
    "max_risk_level": "medium",
    "rollout_strategy": "canary",
    "max_population_share_bp": 1000  # 10%
}
```
The framework treats an experiment as a special type of "Jump," carrying:
```python
from dataclasses import dataclass

@dataclass
class ExperimentJumpRequest:
    eval_surface: EvalSurface
    subject: EvaluationSubject
    candidate_policies: list[PolicyVariant]
    population: PopulationDefinition
    polb_config: PoLBConfig
    eth_overlay: ETHConfig
    role_persona: RolePersonaContext
```
Experiment design must respect ethical (ETH) constraints:
```python
experiment = {
    "variants": {
        "control": {
            "policy": "jump:learning.pick_next_exercise@v1.9.0",
            "traffic_share_bp": 7500  # 75%
        },
        "treatment": {
            "policy": "jump:learning.pick_next_exercise@v2.0.0",
            "traffic_share_bp": 2500  # 25%
        }
    },
    "eth_constraints": {
        "forbid": [
            "randomization_by_protected_attribute",
            "higher_exposure_to_risky_content_for_vulnerable_learners"
        ],
        "require": [
            "treatment_never_worse_than_control_for_wellbeing_on_avg"
        ]
    }
}
```
The assignment process is itself a small jump. It needs to account for ETH permissions, deterministic randomization, and trace logging:
```python
class VariantAssigner:
    def assign(self, principal, context, experiment):
        # The ETH overlay can veto experimental exposure and force the control arm.
        if not self.eth_overlay.permits_assignment(principal, context, experiment):
            return experiment.variants["control"]["policy"], "eth_forced_control"
        shares_bp = {k: int(v["traffic_share_bp"]) for k, v in experiment.variants.items()}
        variant_id = self.randomizer.draw_bp(
            principal_id=principal.id,
            experiment_id=experiment.id,
            shares_bp=shares_bp
        )
        self.eval_trace.log_assignment(
            principal_id=principal.id,
            experiment_id=experiment.id,
            variant=variant_id,
            role_context=context.role_persona
        )
        return experiment.variants[variant_id]["policy"], "assigned"
```
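The `draw_bp` call above is assumed to be a deterministic, audit-friendly randomizer that maps a (principal, experiment) pair onto the basis-point shares. A minimal sketch under that assumption (the class name and seeding scheme are illustrative):

```python
import hashlib

class DeterministicRandomizer:
    """Illustrative deterministic assignment over basis-point shares."""

    def __init__(self, seed: str):
        self.seed = seed

    def draw_bp(self, principal_id, experiment_id, shares_bp):
        # Hash (seed, experiment, principal) into a stable point in [0, 10000).
        digest = hashlib.sha256(
            f"{self.seed}:{experiment_id}:{principal_id}".encode()
        ).digest()
        point = int.from_bytes(digest[:8], "big") % 10_000
        # Walk the cumulative shares; shares are assumed to sum to 10000 bp.
        cumulative = 0
        for variant_id, share_bp in shares_bp.items():
            cumulative += share_bp
            if point < cumulative:
                return variant_id
        raise ValueError("shares_bp must sum to 10000")
```

Deterministic hashing makes every assignment reproducible from a logged seed digest, which supports later audit.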
Shadow evaluation runs candidate policies without affecting real decisions:
```python
shadow_eval = {
    "id": "shadow:city_flood_policy_v3",
    "subject": "jump:city.adjust_flood_gates",
    "polb_config": {
        "envelope_mode": "shadow",
        "mode_name": "SHADOW_PROD",
        "rml_budget": "NONE"  # must have no real-world effect
    },
    "candidate_policy": "jump:city.adjust_flood_gates@v3.0.0",
    "baseline_policy": "jump:city.adjust_flood_gates@v2.5.1",
    "metrics": [
        "GCS_delta_safety",
        "GCS_delta_cost",
        "policy_disagreement_rate_bp"
    ]
}
```
Off-policy evaluation uses historical logs to estimate how a new policy would have performed:
```python
class OffPolicyEvaluator:
    def evaluate(self, logs, candidate_policy, eval_surface):
        estimates = []
        for log in logs:
            context = log.context
            action_taken = log.action
            outcome = log.outcome
            # Inverse-propensity weighting: how likely the candidate policy is
            # to take the logged action, relative to the behavior policy.
            w = self._importance_weight(
                log.behavior_policy_prob,
                candidate_policy.prob(context, action_taken)
            )
            contribution = self._eval_contribution(
                action_taken, outcome, eval_surface
            )
            estimates.append(w * contribution)
        return aggregate_estimates(estimates)
```
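One detail left open above is how `_importance_weight` keeps variance bounded. A minimal standalone sketch, assuming simple weight clipping (the `max_weight` cap is an illustrative choice, not part of the original design):

```python
def importance_weight(behavior_prob: float, candidate_prob: float,
                      max_weight: float = 10.0) -> float:
    """Clipped inverse-propensity weight for off-policy estimates."""
    if behavior_prob <= 0.0:
        return 0.0  # logged action has no support under the behavior policy
    return min(candidate_prob / behavior_prob, max_weight)
```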
Different roles can be the subject of an evaluation or part of its context:
```python
eval_surface = {
    "id": "eval:multi_agent_city_control/v1",
    "subject": {
        "kind": "multi_agent_protocol",
        "id": "proto:city_ops+flood_model@v1"
    },
    "roles_under_test": [
        "role:city_operator_ai",
        "role:flood_model_ai"
    ],
    "roles_observing": [
        "role:human_city_operator"
    ]
}
```
The same experiment results are rendered as different views depending on the persona:
```python
persona_views = {
    "learner_view": {
        "show_metrics": ["mastery_gain_7d", "stress_load"],
        "explanation_style": "simple"
    },
    "teacher_view": {
        "show_metrics": ["mastery_gain_7d", "curriculum_coverage", "risk_flags"],
        "explanation_style": "technical"
    },
    "regulator_view": {
        "show_metrics": ["wellbeing_score", "fairness_gap_metrics", "policy_rollout_pattern"],
        "explanation_style": "regulatory"
    }
}
```
The EvalTrace records the complete lifecycle of an experiment:
```python
eval_trace = {
    "experiment_id": "exp:learning_pick_next_exercise_v2_vs_v1",
    "subject": "jump:learning.pick_next_exercise",
    "eval_surface_id": "eval:learning_exercise_selection/v1",
    "assignments": [
        {
            "principal_id": "learner:1234",
            "variant": "treatment",
            "assigned_at": "2028-04-15T10:00:00Z",
            "role_context": "role:learning_companion",
            "randomization_seed_digest": "sha256:...",
            "reason": "assigned"
        }
    ],
    "outcomes": {
        "window": "7d",
        "metrics": {
            "treatment": {
                "mastery_gain_7d_bp": 2100,
                "wellbeing_score_bp": 8100
            },
            "control": {
                "mastery_gain_7d_bp": 1800,
                "wellbeing_score_bp": 8200
            }
        }
    },
    "polb": {
        "envelope_mode": "online",
        "mode_name": "ONLINE_EXPERIMENTAL_STRATIFIED",
        "canary_phase": {
            "start": "2028-04-10",
            "end": "2028-04-14",
            "max_population_share_bp": 500  # 5%
        }
    }
}
```
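Sample-size planning derives how many subjects each variant needs from the primary metric's variance, using a standard two-sided power calculation: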
```python
import numpy as np
from scipy.stats import norm

class SampleSizeCalculator:
    def calculate(self, eval_surface, effect_size, power=0.8, alpha=0.05, num_variants=2):
        if effect_size <= 0:
            raise ValueError("effect_size must be > 0")
        primary_metric = eval_surface.objectives.primary[0]
        variance = self._estimate_variance(
            primary_metric.name,
            eval_surface.scope.population
        )
        # Two-sided z-test: n = 2 * sigma^2 * ((z_alpha + z_beta) / effect_size)^2
        z_alpha = norm.ppf(1 - alpha / 2)
        z_beta = norm.ppf(power)
        n_per_variant = 2 * variance * ((z_alpha + z_beta) / effect_size) ** 2
        return {
            "n_per_variant": int(np.ceil(n_per_variant)),
            "total_n": int(np.ceil(n_per_variant * num_variants)),
            "assumptions": {
                "effect_size": effect_size,
                "variance": variance,
                "power": power,
                "alpha": alpha,
                "primary_metric": primary_metric.name
            }
        }
```
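Interim looks at a running experiment are governed by an alpha-spending schedule, so that repeated analyses do not inflate the false-positive rate; the engine also stops early on futility or detected harm: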
```python
class SequentialTestingEngine:
    def check_stop(self, experiment, current_data, analysis_number, max_analyses):
        # O'Brien–Fleming-style alpha spending:
        # alpha(t) = 2 * (1 - Phi(z_{1-alpha/2} / sqrt(t))), t = analysis_number / max_analyses,
        # so the full alpha budget is spent exactly at the final analysis.
        z = norm.ppf(1 - self.alpha / 2)
        spent_alpha = 2 * (1 - norm.cdf(
            z / np.sqrt(analysis_number / max_analyses)
        ))
        test_stat, p_value = self._compute_test_stat(
            current_data, experiment.eval_surface
        )
        if p_value < spent_alpha:  # stop for efficacy
            return StopDecision(stop=True, reason="efficacy")
        if self._futility_check(current_data, experiment):  # stop for futility
            return StopDecision(stop=True, reason="futility")
        if self._harm_check(current_data, experiment.eth_constraints):  # stop on ETH violation
            return StopDecision(stop=True, reason="eth_violation_detected")
        return StopDecision(stop=False)
```
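When several candidate experiments compete for the same population, they can be screened for Pareto optimality across predicted information gain, risk, and cost: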
```python
class ParetoExperimentOptimizer:
    def find_pareto_optimal_experiments(self, eval_surface, candidate_experiments):
        evaluations = []
        for exp in candidate_experiments:
            scores = {}
            for obj in eval_surface.objectives.primary:
                scores[obj.name] = self._predict_info_gain(exp, obj)
            scores["risk"] = self._assess_risk(exp, eval_surface)
            scores["cost"] = self._estimate_cost(exp)
            evaluations.append((exp, scores))
        pareto_set = []
        for i, (exp_i, scores_i) in enumerate(evaluations):
            dominated = False
            for j, (exp_j, scores_j) in enumerate(evaluations):
                if i == j:
                    continue
                if self._dominates(scores_j, scores_i, eval_surface):
                    dominated = True
                    break
            if not dominated:
                pareto_set.append((exp_i, scores_i))
        return pareto_set
```
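The `_dominates` check is not shown in the source; it is presumably the standard Pareto dominance test, with risk and cost treated as minimized axes. A minimal standalone sketch under that assumption (the `LOWER_IS_BETTER` set is illustrative):

```python
LOWER_IS_BETTER = {"risk", "cost"}  # illustrative: these axes are minimized

def dominates(scores_a: dict, scores_b: dict) -> bool:
    """True if scores_a is no worse than scores_b everywhere and strictly better somewhere."""
    strictly_better = False
    for name in scores_a:
        a, b = scores_a[name], scores_b[name]
        if name in LOWER_IS_BETTER:
            a, b = -a, -b  # flip sign so that higher is always better
        if a < b:
            return False
        if a > b:
            strictly_better = True
    return strictly_better
```

Beyond one-shot designs, a multi-objective bandit can allocate traffic adaptively, sampling each objective's posterior and scalarizing with the surface's basis-point weights: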
```python
class MultiObjectiveBandit:
    def __init__(self, eval_surface, candidates):
        self.eval_surface = eval_surface
        self.candidates = candidates
        self.posteriors = {
            c.id: self._init_posterior() for c in candidates
        }

    def select_arm(self):
        samples = {}
        for cand in self.candidates:
            objective_samples = {}
            for obj in self.eval_surface.objectives.primary:
                objective_samples[obj.name] = (
                    self.posteriors[cand.id][obj.name].sample()
                )
            samples[cand.id] = self._scalarize(
                objective_samples, self.eval_surface
            )
        return max(samples, key=samples.get)
```
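The `_scalarize` step is assumed to collapse the per-objective samples into a single score using the surface's `weight_bp` values. A minimal sketch under that assumption:

```python
def scalarize(objective_samples: dict, eval_surface) -> float:
    """Weighted sum of objective samples; weights are basis points (10000 bp = 1.0)."""
    return sum(
        objective_samples[obj.name] * (obj.weight_bp / 10_000)
        for obj in eval_surface.objectives.primary
    )
```

A thin evaluator then wraps whichever bandit algorithm is configured, executes the selected candidate as a jump, and feeds the observed metrics back into the posteriors: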
```python
class BanditEvaluator:
    def __init__(self, eval_surface, candidates, algorithm="thompson_sampling"):
        self.eval_surface = eval_surface
        self.candidates = candidates
        if algorithm == "thompson_sampling":
            self.bandit = ThompsonSamplingBandit(candidates)
        elif algorithm == "ucb":
            self.bandit = UCBBandit(candidates)
        else:
            raise ValueError(f"unknown algorithm: {algorithm}")

    def run_episode(self, principal, context):
        candidate = self.bandit.select_arm()
        result = self._execute_jump(candidate, principal, context)
        self.bandit.update(candidate.id, result.metrics)
        return result
```
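Average effects can hide who a policy helps and who it harms; heterogeneous treatment effect (HTE) estimation surfaces that structure: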
```python
class HTEEstimator:
    def estimate(self, experiment_data, eval_surface):
        # Estimate heterogeneous treatment effects with a causal forest or a
        # comparable method (the interface shown here is schematic).
        model = CausalForest(n_estimators=100)
        model.fit(
            X=experiment_data.features,
            T=experiment_data.treatment,
            y=experiment_data.outcomes[eval_surface.objectives.primary[0].name]
        )
        return model.effect(experiment_data.features)
```
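Running experiments also need live monitoring. A streaming aggregator keeps rolling windows over the surface's primary metrics: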
```python
class StreamingMetricsAggregator:
    def __init__(self, eval_surface):
        self.metrics = {
            obj.name: RollingWindow(3600)  # 1-hour window, in seconds
            for obj in eval_surface.objectives.primary
        }

    def update(self, event):
        for metric in self.metrics.values():
            metric.add(event.timestamp, event.value)

    def get_current(self):
        return {
            name: window.current()
            for name, window in self.metrics.items()
        }
```
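Before launch, every experiment passes an approval workflow, with a risk rubric that bounds how much of the population it may touch: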
```python
experiment_approval = {
    "workflow": [
        {
            "step": "design_review",
            "roles": ["role:experiment_designer"],
            "artifacts": ["eval_surface", "sample_size_calculation"]
        },
        {
            "step": "eth_review",
            "roles": ["role:eth_reviewer"],
            "artifacts": ["eth_assessment", "risk_rubric"]
        },
        {
            "step": "final_approval",
            "roles": ["role:experiment_approver"],
            "requires": ["design_review", "eth_review"]
        }
    ],
    "risk_rubric": {
        "low": {
            "max_population_share_bp": 10000,  # 100%
            "eth_constraints": "minimal"
        },
        "medium": {
            "max_population_share_bp": 5000,  # 50%
            "eth_constraints": "standard"
        },
        "high": {
            "max_population_share_bp": 1000,  # 10%
            "eth_constraints": "strict"
        }
    }
}
```
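Property-based tests check invariants of the assignment logic across generated principals, contexts, and experiments: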
```python
from hypothesis import given

# gen_contexts, gen_principals, and gen_experiments are project-specific
# hypothesis strategies.
@given(context=gen_contexts(), principal=gen_principals())
def test_assignment_respects_eth(context, principal):
    exp = make_test_experiment()
    policy, reason = assigner.assign(principal, context, exp)
    assert not eth_overlay.is_forbidden_assignment(
        principal, context, exp, policy
    )

@given(experiment=gen_experiments())
def test_id_consistency(experiment):
    assignments = run_experiment(experiment)
    for a in assignments:
        assert a.principal_id in experiment.scope.population
        assert a.experiment_id == experiment.id
```
In education, the framework can be applied as follows:
```python
education_eval = {
    "id": "eval:math_intervention_grade3/v1",
    "subject": "jump:math.intervention_selection",
    "scope": {
        "domain": "education",
        "population": "grade3_math_struggling",
        "context": "after_school_program"
    },
    "objectives": {
        "primary": [
            {"name": "math_gain_4weeks", "weight_bp": 7000},
            {"name": "engagement_score", "weight_bp": 3000}
        ],
        "secondary": [
            {"name": "teacher_time_saved_minutes", "weight_bp": 2000}
        ]
    },
    "constraints": {
        "hard": [
            "engagement_score >= 6000",
            "no_student_regression == true"
        ]
    }
}
```
In healthcare, the framework is an especially good fit:
```python
health_eval = {
    "id": "eval:diabetes_treatment/v1",
    "subject": "jump:diabetes.treatment_recommendation",
    "scope": {
        "domain": "healthcare",
        "population": "type2_diabetes_newly_diagnosed",
        "context": "primary_care"
    },
    "objectives": {
        "primary": [
            {"name": "hba1c_reduction_3mo", "weight_bp": 6000},
            {"name": "quality_of_life", "weight_bp": 4000}
        ]
    },
    "constraints": {
        "hard": [
            "no_serious_adverse_events == true",
            "no_unexpected_hospitalizations == true"
        ],
        "soft": [
            "treatment_cost_ratio <= 1.5"
        ]
    },
    "polb_config": {
        "envelope_mode": "online",
        "mode_name": "MEDIUM_RISK_HEALTH",
        "max_risk_level": "medium",
        "rollout_strategy": "stratified",
        "max_population_share_bp": 1000  # 10%
    }
}
```
Evaluation needs analysis:
System design phase:
Technical implementation:
Pilot run:
Full rollout:
Cross-functional collaboration:
Incremental implementation:
Continuous monitoring and improvement:
Documentation and training:
Metric conflicts:
Ethical constraints:
System complexity:
Organizational resistance:
Efficient assignment algorithms:
Stream processing:
Distributed execution:
Resource optimization:
Automated experiment design:
Enhanced explainability:
Federated evaluation:
Real-time adaptability:
In real deployments, I have found the greatest value of this framework to be the systematic way of thinking it enforces. Traditional evaluation tends to fixate on a single metric or on short-term effects, whereas this framework forces us to weigh multiple objectives, hard constraints, and stakeholder perspectives at once.
During implementation, the hardest part was balancing flexibility against safety: constraints that are too strict stifle innovation, while constraints that are too loose invite risk. We resolved this tension with the graded mechanisms described above, such as the risk rubric and the sandbox/shadow/online envelope modes.
A particularly useful practice has been building an "evaluation pattern library" that collects and shares successful evaluation designs across domains. It has greatly accelerated adoption of the framework by new teams.