"""Канонический внешний гиппокамп для автономных ИИ-агентов Montana. Объединяет три прежних прототипа в одной интерфейс: - external_hippocampus.ExternalHippocampus — novelty levels, DNA hash, pattern completion - hippocampus.ExternalHippocampus — append-only stream.jsonl, статистика - cognitive_signature.CognitiveSignature — HMAC-SHA256 с domain separation на каждой записи Двухуровневая архитектура с протоколом Montana: - Все записи живут локально в подписанном append-only stream.jsonl (приложение). - daily_anchor() сворачивает дневные записи в один DNA-хэш для коммита в Anchor (протокол). """ from __future__ import annotations import hashlib import hmac import json import secrets from dataclasses import dataclass, field from datetime import datetime, timezone from enum import Enum from pathlib import Path from typing import Iterable, Optional try: import numpy as np from sentence_transformers import SentenceTransformer HAS_EMBEDDINGS = True except Exception: # ImportError либо ValueError из частично сломанных пакетов (Keras 3 + transformers) HAS_EMBEDDINGS = False class RecordKind(Enum): STATE = "agent.state" DECISION = "agent.decision" IDENTITY_CHANGE = "agent.identity_change" TRANSFER = "agent.transfer" ERROR = "agent.error" OBSERVATION = "agent.observation" class NoveltyLevel(Enum): ROUTINE = "routine" NOVEL = "novel" PREDICTION_ERROR = "prediction_error" @dataclass(frozen=True) class SignedRecord: record_id: str agent_id: str timestamp: str kind: str content: str metadata: dict novelty: str prev_id: Optional[str] signature: str def to_dict(self) -> dict: return { "record_id": self.record_id, "agent_id": self.agent_id, "timestamp": self.timestamp, "kind": self.kind, "content": self.content, "metadata": self.metadata, "novelty": self.novelty, "prev_id": self.prev_id, "signature": self.signature, } @classmethod def from_dict(cls, data: dict) -> "SignedRecord": return cls(**data) @dataclass class _NoveltyAssessor: """Оценка новизны записи. Embedding-based если доступен, иначе word-frequency fallback.""" threshold_novel: float = 0.85 threshold_routine: float = 0.50 cache_size: int = 1000 embedder: Optional["SentenceTransformer"] = None _emb_cache: list[tuple[str, "np.ndarray"]] = field(default_factory=list) _word_counts: dict[str, int] = field(default_factory=dict) def assess(self, content: str) -> NoveltyLevel: if self.embedder is not None: return self._assess_embedding(content) return self._assess_word_frequency(content) def _assess_embedding(self, content: str) -> NoveltyLevel: emb = self.embedder.encode(content, convert_to_numpy=True, show_progress_bar=False) emb = emb / (np.linalg.norm(emb) + 1e-12) max_sim = 0.0 for _, prev in self._emb_cache: sim = float(np.dot(emb, prev)) if sim > max_sim: max_sim = sim self._emb_cache.append((content, emb)) if len(self._emb_cache) > self.cache_size: self._emb_cache.pop(0) if max_sim < self.threshold_routine: return NoveltyLevel.PREDICTION_ERROR if max_sim < self.threshold_novel: return NoveltyLevel.NOVEL return NoveltyLevel.ROUTINE def _assess_word_frequency(self, content: str) -> NoveltyLevel: words = [w for w in content.lower().split() if w] if not words: return NoveltyLevel.ROUTINE known = sum(1 for w in words if w in self._word_counts) novelty_ratio = 1.0 - (known / len(words)) for w in words: self._word_counts[w] = self._word_counts.get(w, 0) + 1 if novelty_ratio > 0.5: return NoveltyLevel.PREDICTION_ERROR if novelty_ratio > 0.2: return NoveltyLevel.NOVEL return NoveltyLevel.ROUTINE class AgentHippocampus: DOMAIN_PREFIX = b"montana.agent.hippocampus.v1" def __init__( self, agent_id: str, signing_key: bytes, data_dir: Path | str, embedder: Optional["SentenceTransformer"] = None, ): if len(signing_key) != 32: raise ValueError("signing_key must be exactly 32 bytes") self.agent_id = agent_id self._signing_key = signing_key self.data_dir = Path(data_dir) self.data_dir.mkdir(parents=True, exist_ok=True) self.stream_file = self.data_dir / "stream.jsonl" self._novelty = _NoveltyAssessor(embedder=embedder) self._tail_id: Optional[str] = self._load_tail_id() @staticmethod def generate_signing_key() -> bytes: return secrets.token_bytes(32) def record( self, content: str, kind: RecordKind = RecordKind.STATE, metadata: Optional[dict] = None, ) -> SignedRecord: if not content or not content.strip(): raise ValueError("content must be non-empty") timestamp = datetime.now(timezone.utc).isoformat() novelty = self._novelty.assess(content) meta = dict(metadata or {}) record_id = self._compute_id(timestamp, content, meta) signature = self._sign( kind=kind.value, timestamp=timestamp, content=content, metadata=meta, prev_id=self._tail_id, ) rec = SignedRecord( record_id=record_id, agent_id=self.agent_id, timestamp=timestamp, kind=kind.value, content=content, metadata=meta, novelty=novelty.value, prev_id=self._tail_id, signature=signature, ) with self.stream_file.open("a", encoding="utf-8") as f: f.write(json.dumps(rec.to_dict(), ensure_ascii=False, sort_keys=True) + "\n") self._tail_id = record_id return rec def verify(self, rec: SignedRecord) -> bool: expected = self._sign( kind=rec.kind, timestamp=rec.timestamp, content=rec.content, metadata=rec.metadata, prev_id=rec.prev_id, ) return hmac.compare_digest(expected, rec.signature) def verify_chain(self) -> tuple[bool, Optional[str]]: prev_id: Optional[str] = None for rec in self.iter_records(): if not self.verify(rec): return False, f"signature mismatch at {rec.record_id}" if rec.prev_id != prev_id: return False, f"chain break at {rec.record_id} (prev_id={rec.prev_id}, expected={prev_id})" prev_id = rec.record_id return True, None def iter_records(self) -> Iterable[SignedRecord]: if not self.stream_file.exists(): return with self.stream_file.open("r", encoding="utf-8") as f: for line in f: line = line.strip() if not line: continue yield SignedRecord.from_dict(json.loads(line)) def selective_load( self, token_budget: int, chars_per_token: int = 4, include_routine: bool = False, ) -> list[SignedRecord]: char_budget = token_budget * chars_per_token records = list(self.iter_records()) if not include_routine: records = [r for r in records if r.novelty != NoveltyLevel.ROUTINE.value] records.reverse() chosen: list[SignedRecord] = [] used = 0 for rec in records: cost = len(rec.content) if used + cost > char_budget: continue chosen.append(rec) used += cost chosen.reverse() return chosen def pattern_completion(self, query: str, top_k: int = 5) -> list[SignedRecord]: if self._novelty.embedder is None: q = query.lower() return [r for r in self.iter_records() if q in r.content.lower()][:top_k] q_emb = self._novelty.embedder.encode(query, convert_to_numpy=True, show_progress_bar=False) q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-12) scored: list[tuple[float, SignedRecord]] = [] for rec in self.iter_records(): r_emb = self._novelty.embedder.encode(rec.content, convert_to_numpy=True, show_progress_bar=False) r_emb = r_emb / (np.linalg.norm(r_emb) + 1e-12) sim = float(np.dot(q_emb, r_emb)) scored.append((sim, rec)) scored.sort(key=lambda x: x[0], reverse=True) return [r for _, r in scored[:top_k]] def daily_anchor(self, date: Optional[str] = None) -> dict: if date is None: date = datetime.now(timezone.utc).date().isoformat() records = [r for r in self.iter_records() if r.timestamp.startswith(date)] if not records: return { "agent_id": self.agent_id, "date": date, "count": 0, "dna_hash": None, "novelty_distribution": {}, "first_id": None, "last_id": None, "anchor_payload_hash": None, } record_ids = sorted(r.record_id for r in records) dna = hashlib.sha256("|".join(record_ids).encode("utf-8")).hexdigest() novelty_dist: dict[str, int] = {} for r in records: novelty_dist[r.novelty] = novelty_dist.get(r.novelty, 0) + 1 first_id = records[0].record_id last_id = records[-1].record_id payload = { "agent_id": self.agent_id, "date": date, "count": len(records), "dna_hash": dna, "novelty_distribution": novelty_dist, "first_id": first_id, "last_id": last_id, } payload_bytes = json.dumps(payload, sort_keys=True).encode("utf-8") payload["anchor_payload_hash"] = hashlib.sha256(payload_bytes).hexdigest() return payload def stats(self) -> dict: records = list(self.iter_records()) if not records: return {"count": 0, "novelty_distribution": {}, "first": None, "last": None} novelty: dict[str, int] = {} for r in records: novelty[r.novelty] = novelty.get(r.novelty, 0) + 1 return { "agent_id": self.agent_id, "count": len(records), "novelty_distribution": novelty, "first": records[0].timestamp, "last": records[-1].timestamp, } def _sign(self, kind: str, timestamp: str, content: str, metadata: dict, prev_id: Optional[str]) -> str: payload = json.dumps( { "agent_id": self.agent_id, "kind": kind, "timestamp": timestamp, "content": content, "metadata": metadata, "prev_id": prev_id, }, sort_keys=True, ensure_ascii=False, ).encode("utf-8") message = self.DOMAIN_PREFIX + b"||" + payload return hmac.new(self._signing_key, message, hashlib.sha256).hexdigest() def _compute_id(self, timestamp: str, content: str, metadata: dict) -> str: seed = json.dumps( {"agent_id": self.agent_id, "ts": timestamp, "content": content, "meta": metadata}, sort_keys=True, ensure_ascii=False, ).encode("utf-8") return hashlib.sha256(seed).hexdigest()[:16] def _load_tail_id(self) -> Optional[str]: if not self.stream_file.exists(): return None last_id: Optional[str] = None with self.stream_file.open("r", encoding="utf-8") as f: for line in f: line = line.strip() if line: last_id = json.loads(line).get("record_id") return last_id def load_embedder(model_name: str = "paraphrase-multilingual-MiniLM-L12-v2") -> Optional["SentenceTransformer"]: """Загрузить эмбеддер. Возвращает None если sentence-transformers недоступен.""" if not HAS_EMBEDDINGS: return None return SentenceTransformer(model_name)