# montana/Русский/Гиппокамп/agent_hippocampus.py

"""Канонический внешний гиппокамп для автономных ИИ-агентов Montana.
Объединяет три прежних прототипа в одной интерфейс:
- external_hippocampus.ExternalHippocampus novelty levels, DNA hash, pattern completion
- hippocampus.ExternalHippocampus append-only stream.jsonl, статистика
- cognitive_signature.CognitiveSignature HMAC-SHA256 с domain separation на каждой записи
Двухуровневая архитектура с протоколом Montana:
- Все записи живут локально в подписанном append-only stream.jsonl (приложение).
- daily_anchor() сворачивает дневные записи в один DNA-хэш для коммита в Anchor (протокол).
"""
from __future__ import annotations
import hashlib
import hmac
import json
import secrets
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Iterable, Optional
try:
import numpy as np
from sentence_transformers import SentenceTransformer
HAS_EMBEDDINGS = True
except Exception:  # ImportError, or ValueError raised by partially broken packages (Keras 3 + transformers)
HAS_EMBEDDINGS = False
class RecordKind(Enum):
STATE = "agent.state"
DECISION = "agent.decision"
IDENTITY_CHANGE = "agent.identity_change"
TRANSFER = "agent.transfer"
ERROR = "agent.error"
OBSERVATION = "agent.observation"
class NoveltyLevel(Enum):
ROUTINE = "routine"
NOVEL = "novel"
PREDICTION_ERROR = "prediction_error"
@dataclass(frozen=True)
class SignedRecord:
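    """One signed entry in the append-only stream: hash-chained via prev_id, authenticated via an HMAC signature."""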
record_id: str
agent_id: str
timestamp: str
kind: str
content: str
metadata: dict
novelty: str
prev_id: Optional[str]
signature: str
def to_dict(self) -> dict:
return {
"record_id": self.record_id,
"agent_id": self.agent_id,
"timestamp": self.timestamp,
"kind": self.kind,
"content": self.content,
"metadata": self.metadata,
"novelty": self.novelty,
"prev_id": self.prev_id,
"signature": self.signature,
}
@classmethod
def from_dict(cls, data: dict) -> "SignedRecord":
return cls(**data)
@dataclass
class _NoveltyAssessor:
"""Оценка новизны записи. Embedding-based если доступен, иначе word-frequency fallback."""
threshold_novel: float = 0.85
threshold_routine: float = 0.50
cache_size: int = 1000
embedder: Optional["SentenceTransformer"] = None
_emb_cache: list[tuple[str, "np.ndarray"]] = field(default_factory=list)
_word_counts: dict[str, int] = field(default_factory=dict)
def assess(self, content: str) -> NoveltyLevel:
if self.embedder is not None:
return self._assess_embedding(content)
return self._assess_word_frequency(content)
def _assess_embedding(self, content: str) -> NoveltyLevel:
emb = self.embedder.encode(content, convert_to_numpy=True, show_progress_bar=False)
emb = emb / (np.linalg.norm(emb) + 1e-12)
max_sim = 0.0
for _, prev in self._emb_cache:
sim = float(np.dot(emb, prev))
if sim > max_sim:
max_sim = sim
self._emb_cache.append((content, emb))
if len(self._emb_cache) > self.cache_size:
self._emb_cache.pop(0)
if max_sim < self.threshold_routine:
return NoveltyLevel.PREDICTION_ERROR
if max_sim < self.threshold_novel:
return NoveltyLevel.NOVEL
return NoveltyLevel.ROUTINE
def _assess_word_frequency(self, content: str) -> NoveltyLevel:
words = [w for w in content.lower().split() if w]
if not words:
return NoveltyLevel.ROUTINE
known = sum(1 for w in words if w in self._word_counts)
novelty_ratio = 1.0 - (known / len(words))
for w in words:
self._word_counts[w] = self._word_counts.get(w, 0) + 1
if novelty_ratio > 0.5:
return NoveltyLevel.PREDICTION_ERROR
if novelty_ratio > 0.2:
return NoveltyLevel.NOVEL
return NoveltyLevel.ROUTINE
class AgentHippocampus:
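    """Signed, append-only episodic memory for a single agent.

    record() appends an HMAC-signed, hash-chained entry to stream.jsonl;
    verify()/verify_chain() check integrity; selective_load() and
    pattern_completion() retrieve context; daily_anchor() folds a day's
    records into one DNA hash for the Montana Anchor.
    """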
DOMAIN_PREFIX = b"montana.agent.hippocampus.v1"
def __init__(
self,
agent_id: str,
signing_key: bytes,
data_dir: Path | str,
embedder: Optional["SentenceTransformer"] = None,
):
if len(signing_key) != 32:
raise ValueError("signing_key must be exactly 32 bytes")
self.agent_id = agent_id
self._signing_key = signing_key
self.data_dir = Path(data_dir)
self.data_dir.mkdir(parents=True, exist_ok=True)
self.stream_file = self.data_dir / "stream.jsonl"
self._novelty = _NoveltyAssessor(embedder=embedder)
self._tail_id: Optional[str] = self._load_tail_id()
@staticmethod
def generate_signing_key() -> bytes:
return secrets.token_bytes(32)
def record(
self,
content: str,
kind: RecordKind = RecordKind.STATE,
metadata: Optional[dict] = None,
) -> SignedRecord:
if not content or not content.strip():
raise ValueError("content must be non-empty")
timestamp = datetime.now(timezone.utc).isoformat()
novelty = self._novelty.assess(content)
meta = dict(metadata or {})
record_id = self._compute_id(timestamp, content, meta)
signature = self._sign(
kind=kind.value,
timestamp=timestamp,
content=content,
metadata=meta,
prev_id=self._tail_id,
)
rec = SignedRecord(
record_id=record_id,
agent_id=self.agent_id,
timestamp=timestamp,
kind=kind.value,
content=content,
metadata=meta,
novelty=novelty.value,
prev_id=self._tail_id,
signature=signature,
)
with self.stream_file.open("a", encoding="utf-8") as f:
f.write(json.dumps(rec.to_dict(), ensure_ascii=False, sort_keys=True) + "\n")
self._tail_id = record_id
return rec
def verify(self, rec: SignedRecord) -> bool:
expected = self._sign(
kind=rec.kind,
timestamp=rec.timestamp,
content=rec.content,
metadata=rec.metadata,
prev_id=rec.prev_id,
)
return hmac.compare_digest(expected, rec.signature)
def verify_chain(self) -> tuple[bool, Optional[str]]:
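        """Walk the whole stream, checking every signature and every prev_id link; returns (True, None) or (False, reason)."""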
prev_id: Optional[str] = None
for rec in self.iter_records():
if not self.verify(rec):
return False, f"signature mismatch at {rec.record_id}"
if rec.prev_id != prev_id:
return False, f"chain break at {rec.record_id} (prev_id={rec.prev_id}, expected={prev_id})"
prev_id = rec.record_id
return True, None
def iter_records(self) -> Iterable[SignedRecord]:
if not self.stream_file.exists():
return
with self.stream_file.open("r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
yield SignedRecord.from_dict(json.loads(line))
def selective_load(
self,
token_budget: int,
chars_per_token: int = 4,
include_routine: bool = False,
) -> list[SignedRecord]:
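        """Pick the newest non-routine records that fit a token budget.

        Scans newest-first, skips any record whose content would overflow
        the character budget (token_budget * chars_per_token), and returns
        the selection in chronological order.
        """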
char_budget = token_budget * chars_per_token
records = list(self.iter_records())
if not include_routine:
records = [r for r in records if r.novelty != NoveltyLevel.ROUTINE.value]
records.reverse()
chosen: list[SignedRecord] = []
used = 0
for rec in records:
cost = len(rec.content)
if used + cost > char_budget:
continue
chosen.append(rec)
used += cost
chosen.reverse()
return chosen
def pattern_completion(self, query: str, top_k: int = 5) -> list[SignedRecord]:
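        """Recall the top_k records most similar to the query: cosine similarity over embeddings, or substring match as fallback."""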
if self._novelty.embedder is None:
q = query.lower()
return [r for r in self.iter_records() if q in r.content.lower()][:top_k]
q_emb = self._novelty.embedder.encode(query, convert_to_numpy=True, show_progress_bar=False)
q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12)
scored: list[tuple[float, SignedRecord]] = []
for rec in self.iter_records():
r_emb = self._novelty.embedder.encode(rec.content, convert_to_numpy=True, show_progress_bar=False)
r_emb = r_emb / (np.linalg.norm(r_emb) + 1e-12)
sim = float(np.dot(q_emb, r_emb))
scored.append((sim, rec))
scored.sort(key=lambda x: x[0], reverse=True)
return [r for _, r in scored[:top_k]]
def daily_anchor(self, date: Optional[str] = None) -> dict:
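        """Fold one UTC day's records into an anchor payload: a DNA hash over sorted record ids plus a novelty distribution."""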
if date is None:
date = datetime.now(timezone.utc).date().isoformat()
records = [r for r in self.iter_records() if r.timestamp.startswith(date)]
if not records:
return {
"agent_id": self.agent_id,
"date": date,
"count": 0,
"dna_hash": None,
"novelty_distribution": {},
"first_id": None,
"last_id": None,
"anchor_payload_hash": None,
}
record_ids = sorted(r.record_id for r in records)
dna = hashlib.sha256("|".join(record_ids).encode("utf-8")).hexdigest()
novelty_dist: dict[str, int] = {}
for r in records:
novelty_dist[r.novelty] = novelty_dist.get(r.novelty, 0) + 1
first_id = records[0].record_id
last_id = records[-1].record_id
payload = {
"agent_id": self.agent_id,
"date": date,
"count": len(records),
"dna_hash": dna,
"novelty_distribution": novelty_dist,
"first_id": first_id,
"last_id": last_id,
}
payload_bytes = json.dumps(payload, sort_keys=True).encode("utf-8")
payload["anchor_payload_hash"] = hashlib.sha256(payload_bytes).hexdigest()
return payload
def stats(self) -> dict:
records = list(self.iter_records())
if not records:
return {"count": 0, "novelty_distribution": {}, "first": None, "last": None}
novelty: dict[str, int] = {}
for r in records:
novelty[r.novelty] = novelty.get(r.novelty, 0) + 1
return {
"agent_id": self.agent_id,
"count": len(records),
"novelty_distribution": novelty,
"first": records[0].timestamp,
"last": records[-1].timestamp,
}
def _sign(self, kind: str, timestamp: str, content: str, metadata: dict, prev_id: Optional[str]) -> str:
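        """HMAC-SHA256 over a domain-separated canonical JSON payload; including prev_id binds each record to its predecessor."""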
payload = json.dumps(
{
"agent_id": self.agent_id,
"kind": kind,
"timestamp": timestamp,
"content": content,
"metadata": metadata,
"prev_id": prev_id,
},
sort_keys=True,
ensure_ascii=False,
).encode("utf-8")
message = self.DOMAIN_PREFIX + b"||" + payload
return hmac.new(self._signing_key, message, hashlib.sha256).hexdigest()
def _compute_id(self, timestamp: str, content: str, metadata: dict) -> str:
seed = json.dumps(
{"agent_id": self.agent_id, "ts": timestamp, "content": content, "meta": metadata},
sort_keys=True,
ensure_ascii=False,
).encode("utf-8")
return hashlib.sha256(seed).hexdigest()[:16]
def _load_tail_id(self) -> Optional[str]:
if not self.stream_file.exists():
return None
last_id: Optional[str] = None
with self.stream_file.open("r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
last_id = json.loads(line).get("record_id")
return last_id
def load_embedder(model_name: str = "paraphrase-multilingual-MiniLM-L12-v2") -> Optional["SentenceTransformer"]:
"""Загрузить эмбеддер. Возвращает None если sentence-transformers недоступен."""
if not HAS_EMBEDDINGS:
return None
return SentenceTransformer(model_name)
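

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the Montana protocol): the
    # agent id, the ephemeral key, and the temporary directory are all
    # illustrative only. Runs on the word-frequency fallback, no embedder.
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        hippo = AgentHippocampus(
            agent_id="demo-agent",
            signing_key=AgentHippocampus.generate_signing_key(),
            data_dir=tmp,
        )
        hippo.record("Agent started", kind=RecordKind.STATE)
        hippo.record("Chose plan B after tool failure", kind=RecordKind.DECISION)
        ok, err = hippo.verify_chain()
        print("chain ok:", ok, err)
        print("anchor:", json.dumps(hippo.daily_anchor(), ensure_ascii=False, indent=2))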