dt-knowledge-api/knowledge.py at master · Life-Atlas/dt-knowledge-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Knowledge store — loads LPI data into memory and provides keyword search.
Ported from LPI MCP Server knowledge-store.ts.
"""
import json
import os
import re
from dataclasses import dataclass

# Directory holding the JSON data files: the "data" folder next to this module.
_MODULE_DIR = os.path.dirname(__file__)
DATA_DIR = os.path.join(_MODULE_DIR, "data")


@dataclass
class SearchResult:
    """One ranked hit returned by ``KnowledgeStore.search``.

    A hit is built either from a knowledge-base entry or from a case study;
    the comments on each field note which record key it is filled from.
    """

    # Record "id" ("" when the record has none).
    id: str
    # Record "title" ("" when the record has none).
    title: str
    # Record "content", falling back to "summary" when content is missing/empty.
    content: str
    # Record "tags", falling back to "smilePhases" when there is no "tags" key.
    tags: list[str]
    # Record "source", falling back to "industry" when there is no "source" key.
    source: str
    # Relevance score computed by the search; higher ranks first.
    score: float
    # Record "tier"; "free" when the record does not specify one.
    tier: str


class KnowledgeStore:
    """In-memory knowledge store with simple keyword search.

    On construction three JSON files are read from ``DATA_DIR``:
    ``knowledge-base.json`` into ``entries``, ``case-studies.json`` into
    ``case_studies`` and ``smile-framework.json`` into ``smile``.
    """

    # Common words that shouldn't contribute to title/content matching
    STOP_WORDS = {"the", "a", "an", "is", "are", "was", "were", "be", "been",
                  "do", "does", "did", "have", "has", "had", "it", "its",
                  "to", "of", "in", "for", "on", "with", "at", "by", "from",
                  "and", "or", "but", "not", "no", "this", "that", "these",
                  "i", "me", "my", "we", "our", "you", "your", "they", "them",
                  "even", "also", "just", "very", "so", "too"}

    # Query words that hint at an introductory question.
    _BEGINNER_SIGNALS = frozenset({
        "what", "how", "start", "begin", "getting", "started", "basics",
        "new", "explain", "introduction", "necessary",
    })
    # Query words that name a specific topic; any of these cancels the
    # beginner boost.
    _TOPIC_SIGNALS = frozenset({
        "smile", "edge", "interoperability", "ontology", "ant", "mim", "lean",
        "mvt", "sustainability", "energy", "building", "buildings",
        "maritime", "methodology", "phase", "phases",
    })
    # Entry tags that mark introductory content.
    _BEGINNER_TAGS = ("beginner", "introduction", "getting-started")
    # Score bonus given to introductory entries for beginner queries.
    _BEGINNER_BOOST = 10.0
    # Matches every character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self):
        self.entries: list[dict] = []
        self.case_studies: list[dict] = []
        self.smile: dict = {}
        self._load()

    @staticmethod
    def _read_json(filename: str):
        """Parse and return the JSON document ``filename`` from ``DATA_DIR``."""
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as f:
            return json.load(f)

    def _load(self):
        """Read the three data files into memory.

        Raises:
            OSError: if a data file cannot be opened.
            json.JSONDecodeError: if a data file is not valid JSON.
        """
        self.entries = self._read_json("knowledge-base.json")
        self.case_studies = self._read_json("case-studies.json")
        self.smile = self._read_json("smile-framework.json")

    @classmethod
    def _tokenize(cls, query: str) -> list[str]:
        """Lowercase ``query``, strip punctuation and split on whitespace."""
        return cls._PUNCTUATION.sub('', query.lower()).split()

    @staticmethod
    def _lowered(record: dict, key: str) -> str:
        """Return ``record[key]`` lowercased; missing or null values become ""."""
        return (record.get(key) or "").lower()

    def search(self, query: str, limit: int = 5, include_paid: bool = False) -> list[SearchResult]:
        """Return the best-scoring entries and case studies for ``query``.

        Args:
            query: Free-text query; case and punctuation are ignored.
            limit: Maximum number of results to return.
            include_paid: When False, records whose ``tier`` is ``"paid"``
                are skipped.

        Returns:
            Results sorted by descending score; empty when the query has no
            terms or nothing scores above zero.
        """
        terms = self._tokenize(query)
        if not terms:
            return []

        # Detect beginner intent — boost intro content only when no specific
        # topic terms are present in the query.
        term_set = set(terms)
        is_beginner = (
            not term_set.isdisjoint(self._BEGINNER_SIGNALS)
            and term_set.isdisjoint(self._TOPIC_SIGNALS)
        )

        scored: list[tuple[dict, float]] = []

        for entry in self.entries:
            if not include_paid and entry.get("tier") == "paid":
                continue
            if entry.get("visibility") != "public":
                continue
            score = self._score_entry(entry, terms)
            # The boost applies even when no term matched, so introductory
            # entries surface for generic "how do I start" style questions.
            if is_beginner and any(t in entry.get("tags", []) for t in self._BEGINNER_TAGS):
                score += self._BEGINNER_BOOST
            if score > 0:
                scored.append((entry, score))

        # Case studies are searched too; unlike entries they are not
        # filtered on a "visibility" field.
        for cs in self.case_studies:
            if not include_paid and cs.get("tier") == "paid":
                continue
            score = self._score_case_study(cs, terms)
            if score > 0:
                scored.append((cs, score))

        # list.sort is stable, so equally scored records keep load order.
        scored.sort(key=lambda pair: pair[1], reverse=True)

        return [
            SearchResult(
                id=record.get("id", ""),
                title=record.get("title", ""),
                # Case studies carry a "summary" instead of "content".
                content=record.get("content") or record.get("summary", ""),
                tags=record.get("tags", record.get("smilePhases", [])),
                source=record.get("source", record.get("industry", "")),
                score=score,
                tier=record.get("tier", "free"),
            )
            for record, score in scored[:limit]
        ]

    def _score_entry(self, entry: dict, terms: list[str]) -> float:
        """Score a knowledge-base entry against ``terms``.

        Each non-stop-word term adds 3 for a substring hit in the title,
        2 for a hit in the tags and 1 for a hit in the content.
        """
        title = self._lowered(entry, "title")
        content = self._lowered(entry, "content")
        tags = " ".join(entry.get("tags") or []).lower()

        score = 0.0
        for term in terms:
            if term in self.STOP_WORDS:
                continue
            if term in title:
                score += 3.0
            if term in tags:
                score += 2.0
            if term in content:
                score += 1.0
        return score

    def _score_case_study(self, cs: dict, terms: list[str]) -> float:
        """Score a case study against ``terms``.

        Each non-stop-word term adds 3 for a substring hit in the title,
        2 for a hit in the industry and 1 if it appears in any of
        challenge, approach or outcome.
        """
        title = self._lowered(cs, "title")
        industry = self._lowered(cs, "industry")
        narrative = [
            self._lowered(cs, "challenge"),
            self._lowered(cs, "approach"),
            self._lowered(cs, "outcome"),
        ]

        score = 0.0
        for term in terms:
            # Skip stop words here as well, so words like "the" cannot
            # match a case study on their own.
            if term in self.STOP_WORDS:
                continue
            if term in title:
                score += 3.0
            if term in industry:
                score += 2.0
            if any(term in text for text in narrative):
                score += 1.0
        return score

    def get_smile_overview(self) -> dict:
        """Return a summary of the loaded SMILE framework data.

        Phases are reduced to name, order, description and duration.

        Raises:
            KeyError: if a phase lacks one of those four keys.
        """
        return {
            "methodology": self.smile.get("methodology", {}),
            "phases": [
                {"name": p["name"], "order": p["order"], "description": p["description"], "duration": p["duration"]}
                for p in self.smile.get("phases", [])
            ],
            "perspectives": self.smile.get("perspectives", []),
            "aiJourney": self.smile.get("aiJourney", []),
        }

    def get_phase_detail(self, phase_id: str) -> dict | None:
        """Return the phase whose ``id`` equals ``phase_id``, or None."""
        phases = self.smile.get("phases", [])
        return next((phase for phase in phases if phase["id"] == phase_id), None)

    def has_paid_matches(self, query: str) -> bool:
        """Check if query would match paid-tier content (for CTA triggers).

        The query is tokenized the same way as in ``search``, so
        punctuation does not prevent a match.
        """
        terms = self._tokenize(query)
        if not terms:
            return False
        # NOTE(review): entries are not filtered on "visibility" here,
        # unlike in search() — confirm whether that is intended.
        if any(
            entry.get("tier") == "paid" and self._score_entry(entry, terms) > 0
            for entry in self.entries
        ):
            return True
        return any(
            cs.get("tier") == "paid" and self._score_case_study(cs, terms) > 0
            for cs in self.case_studies
        )


# Module-level singleton; constructing it reads the JSON data files at import time.
store = KnowledgeStore()