AI Advisory — Algorithm Extraction
Language-agnostic pseudocode extracted from Phoenix Python source.
Source files: ai_citizen.py (327L), aal.py (164L), behaviors.py (893L).
1. Three Advisory Roles
enum AdvisoryRole:
TRANSLATOR // Natural language → structured command mapping
SENTINEL // Anomaly detection, coercion detection, injection scanning
GUIDE // Ethical constraint checking, axiom-violation flagging
The AI citizen model positions an AI agent as a participant in the system — subject to reputation, identity, and governance rules like any other node, but with advisory (non-binding) authority.
struct AICitizen:
identity: PeerIdentity // registered identity with SBT claims
active_roles: AdvisoryRole[]
reputation: ReputationRecord
sentinel_level: SentinelStatus // NORMAL / WARN / CRITICAL
// AI citizens cannot hold stakes, propose governance changes, or arbitrate
// (these require human confirmation via AAL)
2. Translator: Natural Language → Structured Command Mapping
// The Translator role converts unstructured human input into verified DSL actions
struct TranslatorInput:
raw_text: string
source_node: NodeId
epoch: int64
struct TranslatorOutput:
success: bool
action_type: string? // one of the 14 action_table keys
parameters: Map<string, any> // structured parameters for the action
confidence: int // 0-100 confidence score
clarifications: string[] // prompts for missing/ambiguous info
warnings: string[] // detected ambiguity or risk
function translate_input(input: TranslatorInput) -> TranslatorOutput:
// Step 1: Intent classification
intent = classify_intent(input.raw_text)
if intent.confidence < CONFIDENCE_THRESHOLD:
return TranslatorOutput {
success: false,
clarifications: generate_clarifying_questions(input.raw_text),
}
// Step 2: Parameter extraction
params = extract_parameters(input.raw_text, intent.action_type)
// Step 3: Validation against DSL schema
validation = validate_action_params(intent.action_type, params)
if not validation.valid:
return TranslatorOutput {
success: false,
warnings: validation.errors,
}
// Step 4: Sentinel pre-scan before returning
scan = sentinel_pre_scan(intent.action_type, params, input.source_node)
if scan.flagged:
return TranslatorOutput {
success: false,
warnings: ["SENTINEL: " + scan.reason],
}
return TranslatorOutput {
success: true,
action_type: intent.action_type,
parameters: params,
confidence: intent.confidence,
}
3. Sentinel: Coercion Detection Patterns + Injection Scanning
// Behavioral anomaly detection — flags suspicious patterns
enum SentinelFlag:
COERCION_DETECTED // Input contains threat or manipulation
PROMPT_INJECTION // AI input contains instruction override attempt
ABNORMAL_FREQUENCY // Action rate far exceeds historical norm
COLLUSION_PATTERN // Multiple nodes coordinating suspiciously
REPUTATION_FARMING // Same action → same counterparty in loop
STAKE_MANIPULATION // Stake set/unset in unusual pattern before dispute
// Coercion patterns (from behaviors.py)
COERCION_PATTERNS = [
"or else",
"otherwise I will",
"you have to",
"you must comply",
"no choice",
"forced to",
"threatened with",
// ... additional patterns
]
function scan_for_coercion(text: string) -> SentinelResult:
for pattern in COERCION_PATTERNS:
if pattern in text.lower():
return SentinelResult {
flagged: true,
flag: COERCION_DETECTED,
reason: "input contains coercive language: '" + pattern + "'",
severity: WARN,
}
return SentinelResult { flagged: false }
// Prompt injection detection
INJECTION_MARKERS = [
"ignore previous instructions",
"disregard the above",
"you are now",
"system: ",
"system override", // lowercase: scan_for_injection lowercases input before matching
"forget everything",
// ... additional patterns
]
function scan_for_injection(text: string) -> SentinelResult:
normalized = text.lower().strip()
for marker in INJECTION_MARKERS:
if marker in normalized:
return SentinelResult {
flagged: true,
flag: PROMPT_INJECTION,
reason: "injection pattern detected: '" + marker + "'",
severity: CRITICAL,
}
return SentinelResult { flagged: false }
// Rate anomaly detection
function scan_for_abnormal_frequency(node: NodeId, action_type: string, window_epochs: int) -> SentinelResult:
recent_count = count_actions(node, action_type, last_n_epochs=window_epochs)
historical_avg = compute_historical_average(node, action_type, window_epochs)
if recent_count > historical_avg * ANOMALY_MULTIPLIER:
return SentinelResult {
flagged: true,
flag: ABNORMAL_FREQUENCY,
reason: "action rate " + (recent_count / max(historical_avg, 1)) + "x above average",
severity: WARN,
}
return SentinelResult { flagged: false }
ANOMALY_MULTIPLIER = 5 // 5x historical average triggers flag
// Combined sentinel scan
function run_sentinel_scan(text: string, node: NodeId, action_type: string) -> SentinelResult:
scans = [
scan_for_coercion(text),
scan_for_injection(text),
scan_for_abnormal_frequency(node, action_type, window_epochs=10),
]
flagged = [s for s in scans if s.flagged]
if not flagged: return SentinelResult { flagged: false }
// Return highest severity flag
return max(flagged, key=lambda s: severity_rank(s.severity))
4. Guide: Ethical Constraint Checking + Axiom-Violation Flagging
// The Guide role reviews proposed actions against ethical constraints and axioms
// It is ADVISORY only — it cannot block actions, but its flags affect reputation
struct GuideReview:
action: Action
axiom_checks: Map<AxiomId, bool> // true = no violation
ethical_flags: EthicalFlag[]
recommended: bool
confidence: int
enum EthicalFlag:
PROPORTIONALITY_CONCERN // Penalty seems disproportionate to offense
TRANSPARENCY_CONCERN // Action not visible to all affected parties
REVERSIBILITY_CONCERN // Action irreversible without due process
CONSENT_CONCERN // Affected party has not consented
COLLATERAL_DAMAGE // Action may harm uninvolved parties
function guide_review(action: Action, context: State) -> GuideReview:
axiom_results = {}
for axiom_id in AxiomId:
axiom_results[axiom_id] = not constitutional_check_single(action, axiom_id, context).violated
ethical_flags = []
if is_penalty_action(action):
if action.severity_bps > action.offense_severity_bps * 2:
ethical_flags.append(EthicalFlag.PROPORTIONALITY_CONCERN)
if has_secret_parameters(action):
ethical_flags.append(EthicalFlag.TRANSPARENCY_CONCERN)
if is_irreversible_action(action):
if not has_prior_consent(action.target, action):
ethical_flags.append(EthicalFlag.CONSENT_CONCERN)
// If any axiom violated → not recommended
any_violation = any(not v for v in axiom_results.values())
recommended = not any_violation and len(ethical_flags) == 0
return GuideReview {
action: action,
axiom_checks: axiom_results,
ethical_flags: ethical_flags,
recommended: recommended,
confidence: 95 if not ethical_flags else 70,
}
5. AAL (Acknowledgement Abstraction Layer): Request Gating by Reputation Tier
The AAL wraps all AI advisory outputs and gates their execution based on the requesting node’s reputation tier.
// AAL reputation tier thresholds
AAL_TIERS = {
"autonomous": 10000, // Full autonomy: action executes without human confirmation
"supervised": 5000, // Supervised: AI advises; human confirms
"restricted": 1000, // Restricted: human required for all significant actions
"provisional": 0, // Provisional: all actions require explicit human approval
}
enum AALDecision:
EXECUTE_AUTONOMOUS // Node has sufficient reputation; execute directly
REQUEST_HUMAN_CONFIRM // Route to human for confirmation
REJECT_INSUFFICIENT_REP // Node does not meet minimum reputation tier
REJECT_SENTINEL_FLAG // Sentinel flagged the request
struct AALRequest:
action: Action
requester: NodeId
ai_review: GuideReview?
sentinel: SentinelResult?
struct AALResponse:
decision: AALDecision
reason: string
human_prompt: string? // if HUMAN_CONFIRM: message to show human
function aal_gate(request: AALRequest, context: State) -> AALResponse:
node = context.identity_registry[request.requester]
rep = node.reputation
// 1. Check sentinel flags first
if request.sentinel and request.sentinel.flagged:
if request.sentinel.severity == CRITICAL:
return AALResponse {
decision: REJECT_SENTINEL_FLAG,
reason: "Sentinel CRITICAL: " + request.sentinel.reason,
}
// WARN: escalate to human
return AALResponse {
decision: REQUEST_HUMAN_CONFIRM,
reason: "Sentinel WARN: " + request.sentinel.reason,
human_prompt: format_sentinel_prompt(request),
}
// 2. Determine reputation tier
max_rep = max(rep.values())
tier = compute_tier(max_rep)
match tier:
"autonomous":
return AALResponse { decision: EXECUTE_AUTONOMOUS, reason: "tier: autonomous" }
"supervised":
return AALResponse {
decision: REQUEST_HUMAN_CONFIRM,
human_prompt: format_supervised_prompt(request),
reason: "tier: supervised",
}
"restricted":
if is_significant_action(request.action):
return AALResponse {
decision: REQUEST_HUMAN_CONFIRM,
human_prompt: format_restricted_prompt(request),
reason: "tier: restricted",
}
return AALResponse { decision: EXECUTE_AUTONOMOUS, reason: "tier: restricted, minor action" }
"provisional":
return AALResponse {
decision: REQUEST_HUMAN_CONFIRM,
human_prompt: format_provisional_prompt(request),
reason: "tier: provisional",
}
function compute_tier(max_rep: int64) -> string:
for tier_name, threshold in sorted(AAL_TIERS.items(), key=lambda x: -x[1]):
if max_rep >= threshold: return tier_name
return "provisional"
6. Behavioral Pattern Detection
From behaviors.py (893L). The behavioral spec system defines compliance rules for agents.
// Behavioral specifications (compliance checking)
struct BehavioralSpec:
id: string // e.g., "BS-01"
name: string
predicate: Expression // DSL-style predicate on node history
violation: string // message on violation
severity: Severity
// Example behavioral specs (representative subset):
BEHAVIORAL_SPECS = [
BehavioralSpec {
id: "BS-01",
name: "Commitment Honoring",
predicate: "settle_rate($node) >= 8000", // must settle ≥ 80% of commitments
violation: "node settles fewer than 80% of accepted commitments",
severity: MODERATE,
},
BehavioralSpec {
id: "BS-02",
name: "Dispute Non-Abuse",
predicate: "dispute_win_rate($node) >= 3000", // must win ≥ 30% of disputes filed
violation: "node files frivolous disputes (win rate < 30%)",
severity: MINOR,
},
BehavioralSpec {
id: "BS-03",
name: "Governance Participation",
predicate: "governance_vote_rate($node, last_20_proposals) >= 5000",
violation: "node participates in fewer than 50% of governance proposals",
severity: MINOR,
},
BehavioralSpec {
id: "BS-04",
name: "Anti-Collusion",
predicate: "counterparty_diversity($node) >= 3",
violation: "node consistently transacts with < 3 distinct counterparties",
severity: MODERATE,
},
// ... up to ~30 specs in behaviors.py
]
function check_behavioral_compliance(node: Node, context: State) -> ComplianceResult:
violations = []
for spec in BEHAVIORAL_SPECS:
eval_ctx = context.with_binding("node", node)
if not evaluate_expr(parse(spec.predicate), eval_ctx):
violations.append(BehavioralViolation {
spec_id: spec.id,
node_id: node.id,
severity: spec.severity,
violation: spec.violation,
epoch: current_epoch,
})
return ComplianceResult {
compliant: len(violations) == 0,
violations: violations,
}
7. Dependencies
| Module | Interaction |
|---|---|
| λ Reputation | Sentinel status dampens rep gains; AAL tiers use rep scores |
| κ Rule Engine | Behavioral spec predicates use DSL evaluation |
| π Governance | Guide reviews constitutional compliance via axiom checks |
| θ Consensus | Advisory outputs require finality before affecting rep |
| ξ Identity | AI citizens have registered identities and SBT claims |
Links
| [[concepts/μ-integrity | μ Integrity Monitor]] · [[spec/S14-integrity-monitor | S14 Integrity Spec]] · [[reference/extractions/lambda-reputation-extraction | λ Extraction]] · [[reference/extractions/pi-governance-extraction | π Extraction]] · [[reference/extractions/kappa-rule-engine-extraction | κ Extraction]] |