Coverage for ai_integration/services/search_service.py: 54%
109 statements
Generated by coverage.py v7.9.2, created at 2025-07-05 02:45 +0800
1"""
2Semantic search service using pgvector
3Searches across family content using vector similarity
4"""
5import logging
6from typing import List, Dict, Any, Union, Optional
7from django.db import models
8from django.db.models import Q, F
9from pgvector.django import CosineDistance, L2Distance
10from family.models import Story, Event, Heritage, Health, Person
11from .embedding_service import embedding_service
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class SearchService:
    """Service for semantic search across family content.

    Results are plain dictionaries built by ``_format_search_result``. Every
    result carries ``id``, ``title``, ``content``, ``similarity`` and
    ``created_at`` plus model-specific keys; the public search methods add a
    ``content_type`` key naming the model type the hit came from.
    """

    # Model mappings for search
    SEARCHABLE_MODELS = {
        'story': Story,
        'event': Event,
        'heritage': Heritage,
        'health': Health,
    }

    # Public category names (including aliases) -> SEARCHABLE_MODELS keys.
    CATEGORY_MAPPING = {
        'stories': 'story',
        'events': 'event',
        'heritage': 'heritage',
        'health': 'health',
        'memories': 'story',  # Alias
        'traditions': 'heritage',  # Alias
    }

    # Text fields matched (case-insensitively) by keyword_search, per model type.
    KEYWORD_FIELDS = {
        'story': ('title', 'content'),
        'event': ('name', 'description'),
        'heritage': ('title', 'description'),
        'health': ('title', 'description'),
    }

    # Maximum characters of body text kept in a result's ``content`` preview.
    PREVIEW_LENGTH = 200
    # Maximum number of related people listed in a result.
    MAX_PEOPLE = 3
    # Similarity reported for keyword hits, which have no vector score.
    KEYWORD_SIMILARITY = 0.5

    def __init__(self):
        self.embedding_service = embedding_service

    @staticmethod
    def _has_embedding(embedding) -> bool:
        """Return True when ``embedding`` is a non-empty vector.

        Plain truthiness (``not embedding``) must not be used here: it raises
        ``ValueError`` for multi-element NumPy arrays, so the check is done
        explicitly via ``None`` and ``len``.
        """
        if embedding is None:
            return False
        try:
            return len(embedding) > 0
        except TypeError:
            # Not sized; a non-None value is treated as a usable embedding.
            return True

    @classmethod
    def _preview(cls, text):
        """Truncate ``text`` to PREVIEW_LENGTH characters, appending '...'.

        Falsy values (``None`` or ``''``) are returned unchanged.
        """
        if not text:
            return text
        if len(text) > cls.PREVIEW_LENGTH:
            return text[:cls.PREVIEW_LENGTH] + '...'
        return text

    @staticmethod
    def _iso(value) -> Optional[str]:
        """Return ``value.isoformat()``, or None when ``value`` is unset."""
        return value.isoformat() if value else None

    def semantic_search(
        self,
        query: str,
        model_types: Optional[List[str]] = None,
        limit: int = 10,
        similarity_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """
        Perform semantic search across family content

        Args:
            query: Search query text
            model_types: List of model types to search ('story', 'event', etc.).
                Defaults to every key of SEARCHABLE_MODELS; unknown types are
                logged and skipped, duplicates are searched once.
            limit: Maximum number of results
            similarity_threshold: Minimum similarity score (0-1)

        Returns:
            List of search results with metadata, best match first. Empty when
            the query is blank, ``limit`` is not positive, or no embedding
            could be generated for the query.
        """
        if not query or not query.strip():
            return []
        if limit <= 0:
            return []

        # Generate query embedding
        query_embedding = self.embedding_service.generate_embedding(query)
        if not self._has_embedding(query_embedding):
            logger.error("Failed to generate query embedding")
            return []

        # Default to all searchable models
        if not model_types:
            model_types = list(self.SEARCHABLE_MODELS)

        all_results = []

        # Search each model type once, preserving the caller's order
        for model_type in dict.fromkeys(model_types):
            if model_type not in self.SEARCHABLE_MODELS:
                logger.warning("Unknown model type: %s", model_type)
                continue

            results = self._search_model(
                self.SEARCHABLE_MODELS[model_type],
                query_embedding,
                limit,
                similarity_threshold
            )

            # Add model type to results
            for result in results:
                result['content_type'] = model_type

            all_results.extend(results)

        # Each model contributed up to ``limit`` hits; merge and keep the best
        all_results.sort(key=lambda x: x['similarity'], reverse=True)
        return all_results[:limit]

    def _search_model(
        self,
        model_class: type[models.Model],
        query_embedding: List[float],
        limit: int,
        similarity_threshold: float
    ) -> List[Dict[str, Any]]:
        """Search a specific model class using vector similarity.

        Rows without a ``content_embedding`` are ignored. Any failure is
        logged with its traceback and reported as an empty result list, so
        one failing model does not abort a multi-model search.
        """
        try:
            # Use cosine distance for similarity search
            results = model_class.objects.filter(
                content_embedding__isnull=False
            ).annotate(
                distance=CosineDistance('content_embedding', query_embedding)
            ).annotate(
                similarity=1 - F('distance')  # Convert distance to similarity
            ).filter(
                similarity__gte=similarity_threshold
            ).order_by('distance')[:limit]  # ascending distance == descending similarity

            search_results = [self._format_search_result(obj) for obj in results]

            logger.info(
                "Found %d results in %s", len(search_results), model_class.__name__
            )
            return search_results

        except Exception as e:
            logger.exception("Search failed for %s: %s", model_class.__name__, e)
            return []

    def _format_search_result(self, obj) -> Dict[str, Any]:
        """Format model instance as search result.

        The layout is chosen from the lower-cased class name of ``obj``.
        ``similarity`` is read from the queryset annotation when present and
        falls back to 0.0 for un-annotated rows (e.g. keyword search hits).
        Unset dates, text and relations are reported as ``None`` rather than
        raising.
        """
        model_name = type(obj).__name__.lower()
        similarity = float(getattr(obj, 'similarity', 0.0))
        created_at = self._iso(getattr(obj, 'created_at', None))

        # Extract relevant fields based on model type
        if model_name == 'story':
            return {
                'id': obj.id,
                'title': obj.title,
                'content': self._preview(obj.content),
                'story_type': obj.story_type,
                'date_occurred': self._iso(obj.date_occurred),
                # Limit to the first few people
                'people': [p.name for p in obj.people.all()[:self.MAX_PEOPLE]],
                'similarity': similarity,
                'created_at': created_at,
            }
        if model_name == 'event':
            return {
                'id': obj.id,
                'title': obj.name,
                'content': self._preview(obj.description),
                'event_type': obj.event_type,
                'start_date': self._iso(obj.start_date),
                'location': obj.location.name if obj.location else None,
                'participants': [
                    p.name for p in obj.participants.all()[:self.MAX_PEOPLE]
                ],
                'similarity': similarity,
                'created_at': created_at,
            }
        if model_name == 'heritage':
            return {
                'id': obj.id,
                'title': obj.title,
                'content': self._preview(obj.description),
                'heritage_type': obj.heritage_type,
                'importance': obj.importance,
                'origin_person': obj.origin_person.name if obj.origin_person else None,
                'similarity': similarity,
                'created_at': created_at,
            }
        if model_name == 'health':
            return {
                'id': obj.id,
                'title': obj.title,
                'content': self._preview(obj.description),
                'record_type': obj.record_type,
                'person': obj.person.name if obj.person else None,
                'date': self._iso(obj.date),
                'is_hereditary': obj.is_hereditary,
                'similarity': similarity,
                'created_at': created_at,
            }

        # Generic format
        return {
            'id': obj.id,
            'title': str(obj),
            'content': '',
            'similarity': similarity,
            'created_at': created_at,
        }

    def search_by_category(
        self,
        query: str,
        category: str,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Search within a specific category/model type

        Args:
            query: Search query
            category: Category to search ('stories', 'events', 'heritage',
                'health', or the aliases 'memories' / 'traditions');
                matched case-insensitively
            limit: Maximum results

        Returns:
            Results of ``semantic_search`` restricted to the mapped model
            type, or an empty list for an unknown or missing category.
        """
        # Map category names to model types
        model_type = self.CATEGORY_MAPPING.get((category or '').strip().lower())
        if not model_type:
            logger.warning("Unknown category: %s", category)
            return []

        return self.semantic_search(query, [model_type], limit)

    def find_related_content(
        self,
        content_id: int,
        content_type: str,
        limit: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Find content similar to a given piece of content

        Args:
            content_id: ID of the reference content
            content_type: Type of reference content
            limit: Maximum results

        Returns:
            Up to ``limit`` results from all searchable models, most similar
            first, never including the reference object itself. Empty when
            the type is unknown, the object is missing or has no embedding,
            or the lookup fails.
        """
        if content_type not in self.SEARCHABLE_MODELS:
            return []
        if limit <= 0:
            return []

        model_class = self.SEARCHABLE_MODELS[content_type]
        try:
            # Get the reference object
            ref_obj = model_class.objects.get(id=content_id)

            ref_embedding = ref_obj.content_embedding
            if not self._has_embedding(ref_embedding):
                logger.warning("No embedding for %s:%s", content_type, content_id)
                return []

            # Search for similar content (excluding the reference object)
            all_results = []
            for model_type, search_model in self.SEARCHABLE_MODELS.items():
                candidates = search_model.objects.filter(
                    content_embedding__isnull=False
                )
                if model_type == content_type:
                    # IDs are only unique per model, so exclude the reference
                    # row from its own model only.
                    candidates = candidates.exclude(id=content_id)

                results = candidates.annotate(
                    distance=CosineDistance('content_embedding', ref_embedding)
                ).annotate(
                    similarity=1 - F('distance')
                ).order_by('distance')[:limit]

                for obj in results:
                    result = self._format_search_result(obj)
                    result['content_type'] = model_type
                    all_results.append(result)

            # Sort and limit
            all_results.sort(key=lambda x: x['similarity'], reverse=True)
            return all_results[:limit]

        except model_class.DoesNotExist:
            logger.warning("Reference %s:%s does not exist", content_type, content_id)
            return []
        except Exception as e:
            logger.exception("Failed to find related content: %s", e)
            return []

    def keyword_search(
        self,
        query: str,
        model_types: Optional[List[str]] = None,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Fallback keyword search when semantic search fails

        Args:
            query: Text matched case-insensitively as a substring of each
                model's KEYWORD_FIELDS
            model_types: Model types to search; defaults to all searchable
                models. Unknown types are skipped, duplicates searched once.
            limit: Maximum results (applied per model and to the total)

        Returns:
            Formatted results, each with ``similarity`` set to
            KEYWORD_SIMILARITY. Empty for a blank query or non-positive limit.
        """
        # A blank term would match every row via ``icontains``
        if not query or not query.strip():
            return []
        if limit <= 0:
            return []

        if not model_types:
            model_types = list(self.SEARCHABLE_MODELS)

        all_results = []

        for model_type in dict.fromkeys(model_types):
            if model_type not in self.SEARCHABLE_MODELS:
                continue

            model_class = self.SEARCHABLE_MODELS[model_type]

            # Build keyword search query: OR over the model's text fields
            search_q = Q()
            for field in self.KEYWORD_FIELDS.get(model_type, ()):
                search_q |= Q(**{f"{field}__icontains": query})

            results = model_class.objects.filter(search_q)[:limit]

            for obj in results:
                result = self._format_search_result(obj)
                result['content_type'] = model_type
                # Default similarity for keyword search
                result['similarity'] = self.KEYWORD_SIMILARITY
                all_results.append(result)

        return all_results[:limit]
# Global service instance, created at import time.
search_service = SearchService()