"""Канонический внешний гиппокамп для автономных ИИ-агентов Montana.

Объединяет три прежних прототипа в одном интерфейсе:

- external_hippocampus.ExternalHippocampus — novelty levels, DNA hash, pattern completion
- hippocampus.ExternalHippocampus — append-only stream.jsonl, статистика
- cognitive_signature.CognitiveSignature — HMAC-SHA256 с domain separation на каждой записи

Двухуровневая архитектура с протоколом Montana:

- Все записи живут локально в подписанном append-only stream.jsonl (приложение).
- daily_anchor() сворачивает дневные записи в один DNA-хэш для коммита в Anchor (протокол).
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import hashlib
|
|||
|
|
import hmac
|
|||
|
|
import json
|
|||
|
|
import secrets
|
|||
|
|
from dataclasses import dataclass, field
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
from enum import Enum
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Iterable, Optional
|
|||
|
|
|
|||
|
|
# Optional dependency: embedding-based novelty assessment and semantic
# pattern completion need numpy + sentence-transformers. When either is
# missing, HAS_EMBEDDINGS stays False and _NoveltyAssessor falls back to
# its word-frequency heuristic.
try:
    import numpy as np
    from sentence_transformers import SentenceTransformer

    HAS_EMBEDDINGS = True
except ImportError:
    HAS_EMBEDDINGS = False
|
|||
|
|
|
|||
|
|
|
|||
|
|
class RecordKind(Enum):
    """Kinds of events an agent can append to its memory stream.

    Values are namespaced strings ("agent.*") as stored in stream.jsonl
    and covered by each record's signature.
    """

    STATE = "agent.state"
    DECISION = "agent.decision"
    IDENTITY_CHANGE = "agent.identity_change"
    TRANSFER = "agent.transfer"
    ERROR = "agent.error"
    OBSERVATION = "agent.observation"
|
|||
|
|
|
|||
|
|
|
|||
|
|
class NoveltyLevel(Enum):
    """Novelty classification of a record relative to previously seen content.

    Assigned by _NoveltyAssessor: ROUTINE means highly similar to the
    history, NOVEL is moderately new, PREDICTION_ERROR is strongly
    unexpected (lowest similarity / highest unseen-word fraction).
    """

    ROUTINE = "routine"
    NOVEL = "novel"
    PREDICTION_ERROR = "prediction_error"
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass(frozen=True)
class SignedRecord:
    """One immutable, HMAC-signed entry of the agent memory stream.

    Instances round-trip through ``to_dict`` / ``from_dict`` for JSON-lines
    persistence in stream.jsonl.
    """

    record_id: str
    agent_id: str
    timestamp: str
    kind: str
    content: str
    metadata: dict
    novelty: str
    prev_id: Optional[str]
    signature: str

    def to_dict(self) -> dict:
        """Serialize to a plain JSON-ready dict, keys in field order."""
        names = (
            "record_id",
            "agent_id",
            "timestamp",
            "kind",
            "content",
            "metadata",
            "novelty",
            "prev_id",
            "signature",
        )
        return {name: getattr(self, name) for name in names}

    @classmethod
    def from_dict(cls, data: dict) -> "SignedRecord":
        """Inverse of ``to_dict``: rebuild a record from its dict form."""
        return cls(**data)
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class _NoveltyAssessor:
    """Classify how novel a piece of content is.

    When a SentenceTransformer is supplied, novelty is the inverse of the
    best cosine similarity against a bounded cache of recent embeddings.
    Without one, a word-frequency heuristic is used instead.
    """

    # Similarity below threshold_routine => PREDICTION_ERROR; below
    # threshold_novel => NOVEL; otherwise ROUTINE.
    threshold_novel: float = 0.85
    threshold_routine: float = 0.50
    cache_size: int = 1000

    embedder: Optional["SentenceTransformer"] = None
    _emb_cache: list[tuple[str, "np.ndarray"]] = field(default_factory=list)
    _word_counts: dict[str, int] = field(default_factory=dict)

    def assess(self, content: str) -> NoveltyLevel:
        """Dispatch to the embedding path when available, else the fallback."""
        if self.embedder is None:
            return self._assess_word_frequency(content)
        return self._assess_embedding(content)

    def _assess_embedding(self, content: str) -> NoveltyLevel:
        vec = self.embedder.encode(content, convert_to_numpy=True, show_progress_bar=False)
        vec = vec / (np.linalg.norm(vec) + 1e-12)
        # Best cosine similarity against the cached history; starts at 0.0,
        # so an empty cache (or all-negative similarities) scores as maximally novel.
        best = 0.0
        for _, cached in self._emb_cache:
            best = max(best, float(np.dot(vec, cached)))
        self._emb_cache.append((content, vec))
        if len(self._emb_cache) > self.cache_size:
            self._emb_cache.pop(0)  # evict the oldest entry
        if best < self.threshold_routine:
            return NoveltyLevel.PREDICTION_ERROR
        if best < self.threshold_novel:
            return NoveltyLevel.NOVEL
        return NoveltyLevel.ROUTINE

    def _assess_word_frequency(self, content: str) -> NoveltyLevel:
        tokens = content.lower().split()
        if not tokens:
            return NoveltyLevel.ROUTINE
        seen_before = sum(1 for t in tokens if t in self._word_counts)
        unseen_fraction = 1.0 - seen_before / len(tokens)
        # Update counts only after scoring, so a word is not "known"
        # within the very record that introduces it.
        for t in tokens:
            self._word_counts[t] = self._word_counts.get(t, 0) + 1
        if unseen_fraction > 0.5:
            return NoveltyLevel.PREDICTION_ERROR
        if unseen_fraction > 0.2:
            return NoveltyLevel.NOVEL
        return NoveltyLevel.ROUTINE
|
|||
|
|
|
|||
|
|
|
|||
|
|
class AgentHippocampus:
    """Append-only, HMAC-signed external memory ("hippocampus") for one agent.

    Each record is chained to its predecessor via ``prev_id`` and persisted
    as one JSON object per line in ``stream.jsonl``. Signatures are
    HMAC-SHA256 over a canonical JSON payload prefixed with a domain
    separator, so they cannot collide with other Montana signing contexts.

    NOTE(review): signing relies on the exact ``json.dumps`` parameters in
    ``_sign``; changing them would invalidate all previously written records.
    """

    # Domain-separation prefix mixed into every HMAC message (see _sign).
    DOMAIN_PREFIX = b"montana.agent.hippocampus.v1"

    def __init__(
        self,
        agent_id: str,
        signing_key: bytes,
        data_dir: Path | str,
        embedder: Optional["SentenceTransformer"] = None,
    ):
        """Open (or create) the agent's data directory and resume the chain.

        Raises:
            ValueError: if ``signing_key`` is not exactly 32 bytes.
        """
        if len(signing_key) != 32:
            raise ValueError("signing_key must be exactly 32 bytes")
        self.agent_id = agent_id
        self._signing_key = signing_key
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.stream_file = self.data_dir / "stream.jsonl"
        self._novelty = _NoveltyAssessor(embedder=embedder)
        # Resume the hash chain from the last record already on disk.
        self._tail_id: Optional[str] = self._load_tail_id()

    @staticmethod
    def generate_signing_key() -> bytes:
        """Return a fresh, cryptographically random 32-byte signing key."""
        return secrets.token_bytes(32)

    def record(
        self,
        content: str,
        kind: RecordKind = RecordKind.STATE,
        metadata: Optional[dict] = None,
    ) -> SignedRecord:
        """Assess novelty, sign, and append one record to the stream.

        Returns the fully populated SignedRecord as written to disk.

        Raises:
            ValueError: if ``content`` is empty or whitespace-only.
        """
        if not content or not content.strip():
            raise ValueError("content must be non-empty")
        timestamp = datetime.now(timezone.utc).isoformat()
        novelty = self._novelty.assess(content)
        # Copy so later caller mutation cannot change the signed payload.
        meta = dict(metadata or {})
        record_id = self._compute_id(timestamp, content, meta)
        # The signature covers the chain position (prev_id) as well as the
        # payload, so records cannot be reordered without detection.
        signature = self._sign(
            kind=kind.value,
            timestamp=timestamp,
            content=content,
            metadata=meta,
            prev_id=self._tail_id,
        )
        rec = SignedRecord(
            record_id=record_id,
            agent_id=self.agent_id,
            timestamp=timestamp,
            kind=kind.value,
            content=content,
            metadata=meta,
            novelty=novelty.value,
            prev_id=self._tail_id,
            signature=signature,
        )
        # Append as one JSON line; sort_keys keeps the on-disk layout stable.
        with self.stream_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps(rec.to_dict(), ensure_ascii=False, sort_keys=True) + "\n")
        self._tail_id = record_id
        return rec

    def verify(self, rec: SignedRecord) -> bool:
        """Recompute the HMAC for *rec* and compare in constant time."""
        expected = self._sign(
            kind=rec.kind,
            timestamp=rec.timestamp,
            content=rec.content,
            metadata=rec.metadata,
            prev_id=rec.prev_id,
        )
        return hmac.compare_digest(expected, rec.signature)

    def verify_chain(self) -> tuple[bool, Optional[str]]:
        """Verify every signature and the prev_id linkage of the whole stream.

        Returns ``(True, None)`` on success, or ``(False, reason)`` naming
        the first record where verification failed.
        """
        prev_id: Optional[str] = None
        for rec in self.iter_records():
            if not self.verify(rec):
                return False, f"signature mismatch at {rec.record_id}"
            if rec.prev_id != prev_id:
                return False, f"chain break at {rec.record_id} (prev_id={rec.prev_id}, expected={prev_id})"
            prev_id = rec.record_id
        return True, None

    def iter_records(self) -> Iterable[SignedRecord]:
        """Yield records in file (chronological) order, skipping blank lines."""
        if not self.stream_file.exists():
            return
        with self.stream_file.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                yield SignedRecord.from_dict(json.loads(line))

    def selective_load(
        self,
        token_budget: int,
        chars_per_token: int = 4,
        include_routine: bool = False,
    ) -> list[SignedRecord]:
        """Pick records for a context window within an approximate token budget.

        Newest records are preferred; ROUTINE records are excluded unless
        ``include_routine`` is set. The budget is approximated as
        ``token_budget * chars_per_token`` characters of content. The result
        is returned oldest-first.
        """
        char_budget = token_budget * chars_per_token
        records = list(self.iter_records())
        if not include_routine:
            records = [r for r in records if r.novelty != NoveltyLevel.ROUTINE.value]
        records.reverse()
        chosen: list[SignedRecord] = []
        used = 0
        for rec in records:
            cost = len(rec.content)
            if used + cost > char_budget:
                # Skip records that don't fit but keep trying smaller,
                # older ones to fill the remaining budget.
                continue
            chosen.append(rec)
            used += cost
        # Restore chronological order for presentation.
        chosen.reverse()
        return chosen

    def pattern_completion(self, query: str, top_k: int = 5) -> list[SignedRecord]:
        """Return up to *top_k* records most relevant to *query*.

        Without an embedder this is a case-insensitive substring match in
        stream order. With one, records are ranked by cosine similarity
        (note: every record is re-encoded on each query).
        """
        if self._novelty.embedder is None:
            q = query.lower()
            return [r for r in self.iter_records() if q in r.content.lower()][:top_k]
        q_emb = self._novelty.embedder.encode(query, convert_to_numpy=True, show_progress_bar=False)
        q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12)
        scored: list[tuple[float, SignedRecord]] = []
        for rec in self.iter_records():
            r_emb = self._novelty.embedder.encode(rec.content, convert_to_numpy=True, show_progress_bar=False)
            r_emb = r_emb / (np.linalg.norm(r_emb) + 1e-12)
            sim = float(np.dot(q_emb, r_emb))
            scored.append((sim, rec))
        scored.sort(key=lambda x: x[0], reverse=True)
        return [r for _, r in scored[:top_k]]

    def daily_anchor(self, date: Optional[str] = None) -> dict:
        """Fold one day's records into an anchor payload for the Montana protocol.

        *date* is an ISO date string (defaults to today, UTC). The DNA hash
        is SHA-256 over the sorted, '|'-joined record ids of that day; the
        payload hash covers the payload WITHOUT the hash field itself.
        """
        if date is None:
            date = datetime.now(timezone.utc).date().isoformat()
        # ISO timestamps start with the ISO date, so a prefix match selects the day.
        records = [r for r in self.iter_records() if r.timestamp.startswith(date)]
        if not records:
            return {
                "agent_id": self.agent_id,
                "date": date,
                "count": 0,
                "dna_hash": None,
                "novelty_distribution": {},
                "first_id": None,
                "last_id": None,
                "anchor_payload_hash": None,
            }
        # Sorted ids make the DNA hash independent of on-disk ordering.
        record_ids = sorted(r.record_id for r in records)
        dna = hashlib.sha256("|".join(record_ids).encode("utf-8")).hexdigest()
        novelty_dist: dict[str, int] = {}
        for r in records:
            novelty_dist[r.novelty] = novelty_dist.get(r.novelty, 0) + 1
        first_id = records[0].record_id
        last_id = records[-1].record_id
        payload = {
            "agent_id": self.agent_id,
            "date": date,
            "count": len(records),
            "dna_hash": dna,
            "novelty_distribution": novelty_dist,
            "first_id": first_id,
            "last_id": last_id,
        }
        # Hash the payload before inserting the hash field, so the hash is
        # verifiable by recomputing over the payload minus that field.
        payload_bytes = json.dumps(payload, sort_keys=True).encode("utf-8")
        payload["anchor_payload_hash"] = hashlib.sha256(payload_bytes).hexdigest()
        return payload

    def stats(self) -> dict:
        """Summarize the stream: count, novelty distribution, first/last timestamps."""
        records = list(self.iter_records())
        if not records:
            return {"count": 0, "novelty_distribution": {}, "first": None, "last": None}
        novelty: dict[str, int] = {}
        for r in records:
            novelty[r.novelty] = novelty.get(r.novelty, 0) + 1
        return {
            "agent_id": self.agent_id,
            "count": len(records),
            "novelty_distribution": novelty,
            "first": records[0].timestamp,
            "last": records[-1].timestamp,
        }

    def _sign(self, kind: str, timestamp: str, content: str, metadata: dict, prev_id: Optional[str]) -> str:
        """HMAC-SHA256 over the canonical JSON payload, domain-separated.

        The exact dumps parameters (sort_keys, ensure_ascii=False) are part
        of the signature format — do not change them.
        """
        payload = json.dumps(
            {
                "agent_id": self.agent_id,
                "kind": kind,
                "timestamp": timestamp,
                "content": content,
                "metadata": metadata,
                "prev_id": prev_id,
            },
            sort_keys=True,
            ensure_ascii=False,
        ).encode("utf-8")
        message = self.DOMAIN_PREFIX + b"||" + payload
        return hmac.new(self._signing_key, message, hashlib.sha256).hexdigest()

    def _compute_id(self, timestamp: str, content: str, metadata: dict) -> str:
        """Derive a 16-hex-char record id from agent, timestamp, content, metadata.

        NOTE(review): the id does not cover ``kind`` or ``prev_id``; two
        records differing only in kind within the same timestamp would
        collide — confirm whether this is acceptable.
        """
        seed = json.dumps(
            {"agent_id": self.agent_id, "ts": timestamp, "content": content, "meta": metadata},
            sort_keys=True,
            ensure_ascii=False,
        ).encode("utf-8")
        return hashlib.sha256(seed).hexdigest()[:16]

    def _load_tail_id(self) -> Optional[str]:
        """Scan stream.jsonl and return the record_id of the last non-blank line."""
        if not self.stream_file.exists():
            return None
        last_id: Optional[str] = None
        with self.stream_file.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    last_id = json.loads(line).get("record_id")
        return last_id
|
|||
|
|
|
|||
|
|
|
|||
|
|
def load_embedder(model_name: str = "paraphrase-multilingual-MiniLM-L12-v2") -> Optional["SentenceTransformer"]:
    """Load the sentence embedder, or None when sentence-transformers is unavailable."""
    return SentenceTransformer(model_name) if HAS_EMBEDDINGS else None