Coverage for ai_integration/services/search_service.py: 54%

109 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-05 02:45 +0800

1""" 

2Semantic search service using pgvector 

3Searches across family content using vector similarity 

4""" 

5import logging 

6from typing import List, Dict, Any, Union, Optional 

7from django.db import models 

8from django.db.models import Q, F 

9from pgvector.django import CosineDistance, L2Distance 

10from family.models import Story, Event, Heritage, Health, Person 

11from .embedding_service import embedding_service 

12 

13logger = logging.getLogger(__name__) 

14 

15 

class SearchService:
    """Semantic (pgvector) and keyword search across family content.

    Every search method returns a list of plain dictionaries built by
    ``_format_search_result``; each dictionary additionally carries a
    ``content_type`` key holding the ``SEARCHABLE_MODELS`` key of the model
    the row came from.
    """

    # Maps the public content-type key to the Django model that is searched.
    SEARCHABLE_MODELS = {
        'story': Story,
        'event': Event,
        'heritage': Heritage,
        'health': Health,
    }

    # Text fields matched case-insensitively by keyword_search, per type.
    _KEYWORD_FIELDS = {
        'story': ('title', 'content'),
        'event': ('name', 'description'),
        'heritage': ('title', 'description'),
        'health': ('title', 'description'),
    }

    # Category names (and aliases) accepted by search_by_category.
    _CATEGORY_MODEL_TYPES = {
        'stories': 'story',
        'events': 'event',
        'heritage': 'heritage',
        'health': 'health',
        'memories': 'story',  # Alias
        'traditions': 'heritage',  # Alias
    }

    # Maximum number of characters of body text returned before truncation.
    _PREVIEW_LENGTH = 200

    # Fixed score reported for keyword hits, which have no vector distance.
    _KEYWORD_SIMILARITY = 0.5

    def __init__(self):
        self.embedding_service = embedding_service

    def semantic_search(
        self,
        query: str,
        model_types: Optional[List[str]] = None,
        limit: int = 10,
        similarity_threshold: float = 0.7
    ) -> List[Dict[str, Any]]:
        """
        Perform semantic search across family content

        Args:
            query: Search query text
            model_types: Keys of SEARCHABLE_MODELS to search ('story',
                'event', ...). Unknown keys are logged and skipped; a falsy
                value means "all searchable models".
            limit: Maximum number of results (applied per model and again
                to the merged list)
            similarity_threshold: Minimum similarity score, where
                similarity is ``1 - cosine distance``

        Returns:
            Search results sorted by descending similarity. Empty when the
            query is blank or no embedding could be generated.
        """
        if not query or not query.strip():
            return []

        query_embedding = self.embedding_service.generate_embedding(query)
        # Explicit None/length test: truth-testing a multi-element
        # array-like raises ValueError instead of answering the question.
        if query_embedding is None or len(query_embedding) == 0:
            logger.error("Failed to generate query embedding")
            return []

        # Default to all searchable models
        if not model_types:
            model_types = list(self.SEARCHABLE_MODELS)

        all_results = []
        for model_type in model_types:
            model_class = self.SEARCHABLE_MODELS.get(model_type)
            if model_class is None:
                logger.warning("Unknown model type: %s", model_type)
                continue

            results = self._search_model(
                model_class,
                query_embedding,
                limit,
                similarity_threshold
            )
            for result in results:
                result['content_type'] = model_type
            all_results.extend(results)

        # Each model contributed up to `limit` rows; merge, rank, trim.
        all_results.sort(key=lambda item: item['similarity'], reverse=True)
        return all_results[:limit]

    def _search_model(
        self,
        model_class: models.Model,
        query_embedding: List[float],
        limit: int,
        similarity_threshold: float
    ) -> List[Dict[str, Any]]:
        """Search one model class by cosine similarity to ``query_embedding``.

        Args:
            model_class: Django model class exposing a ``content_embedding``
                vector field.
            query_embedding: Embedding of the search query.
            limit: Maximum number of rows to return.
            similarity_threshold: Rows whose ``1 - cosine distance`` is below
                this value are dropped.

        Returns:
            Formatted results, best match first. Any exception raised while
            querying or formatting is logged and an empty list is returned.
        """
        try:
            queryset = (
                model_class.objects
                .filter(content_embedding__isnull=False)
                .annotate(
                    distance=CosineDistance('content_embedding', query_embedding)
                )
                .annotate(similarity=1 - F('distance'))
                .filter(similarity__gte=similarity_threshold)
                # Ascending raw distance ranks the same as descending
                # similarity, but pgvector can only serve the ordering from
                # an index when it is the bare distance operator.
                .order_by('distance')[:limit]
            )
            search_results = [
                self._format_search_result(obj) for obj in queryset
            ]
        except Exception as exc:
            # Best-effort boundary: one failing model must not abort the
            # whole search. logger.exception keeps the traceback.
            logger.exception(
                "Search failed for %s: %s", model_class.__name__, exc
            )
            return []

        logger.info(
            "Found %s results in %s", len(search_results), model_class.__name__
        )
        return search_results

    @classmethod
    def _preview(cls, text):
        """Return ``text`` cut to the preview length; falsy values pass through."""
        if text and len(text) > cls._PREVIEW_LENGTH:
            return text[:cls._PREVIEW_LENGTH] + '...'
        return text

    @staticmethod
    def _iso(value):
        """Return ``value.isoformat()``, or None when ``value`` is None."""
        return value.isoformat() if value is not None else None

    def _format_search_result(
        self,
        obj,
        similarity: Optional[float] = None
    ) -> Dict[str, Any]:
        """Format a model instance as a search result dictionary.

        Args:
            obj: Model instance. The fields read depend on its class name
                (story / event / heritage / health); any other class gets a
                generic result built from ``str(obj)``.
            similarity: Score to report. When omitted, ``obj.similarity`` is
                read, so the instance must come from a queryset that
                annotated it.

        Returns:
            Dictionary with ``id``, ``title``, ``content`` (truncated
            preview), the type-specific fields, ``similarity`` and
            ``created_at``.
        """
        model_name = type(obj).__name__.lower()
        score = float(obj.similarity if similarity is None else similarity)

        if model_name == 'story':
            title, body = obj.title, obj.content
            extras = {
                'story_type': obj.story_type,
                'date_occurred': self._iso(obj.date_occurred),
                # Limit to first 3
                'people': [p.name for p in obj.people.all()[:3]],
            }
        elif model_name == 'event':
            title, body = obj.name, obj.description
            extras = {
                'event_type': obj.event_type,
                'start_date': self._iso(obj.start_date),
                'location': obj.location.name if obj.location else None,
                'participants': [p.name for p in obj.participants.all()[:3]],
            }
        elif model_name == 'heritage':
            title, body = obj.title, obj.description
            extras = {
                'heritage_type': obj.heritage_type,
                'importance': obj.importance,
                'origin_person': (
                    obj.origin_person.name if obj.origin_person else None
                ),
            }
        elif model_name == 'health':
            title, body = obj.title, obj.description
            extras = {
                'record_type': obj.record_type,
                'person': obj.person.name if obj.person else None,
                'date': self._iso(obj.date),
                'is_hereditary': obj.is_hereditary,
            }
        else:
            # Generic format
            title, body, extras = str(obj), '', {}

        return {
            'id': obj.id,
            'title': title,
            'content': self._preview(body),
            **extras,
            'similarity': score,
            'created_at': self._iso(getattr(obj, 'created_at', None)),
        }

    def search_by_category(
        self,
        query: str,
        category: str,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Search within a specific category/model type

        Args:
            query: Search query
            category: Category to search ('stories', 'events', 'heritage',
                'health', or the aliases 'memories' and 'traditions');
                matched case-insensitively
            limit: Maximum results

        Returns:
            Results of ``semantic_search`` restricted to the mapped model
            type, or an empty list for an unknown or empty category.
        """
        category_key = (category or '').strip().lower()
        model_type = self._CATEGORY_MODEL_TYPES.get(category_key)
        if not model_type:
            logger.warning("Unknown category: %s", category)
            return []

        return self.semantic_search(query, [model_type], limit)

    def find_related_content(
        self,
        content_id: int,
        content_type: str,
        limit: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Find content similar to a given piece of content

        Args:
            content_id: ID of the reference content
            content_type: Key of SEARCHABLE_MODELS for the reference content
            limit: Maximum results

        Returns:
            Up to ``limit`` results from all searchable models, ranked by
            cosine similarity to the reference embedding and excluding the
            reference object itself. Empty when the type is unknown, the
            reference has no embedding, or any error occurs (logged).
        """
        model_class = self.SEARCHABLE_MODELS.get(content_type)
        if model_class is None:
            return []

        try:
            ref_obj = model_class.objects.get(id=content_id)
            ref_embedding = ref_obj.content_embedding

            # Must be an identity test: a stored vector is array-like and
            # truth-testing it raises ValueError.
            if ref_embedding is None:
                logger.warning(
                    "No embedding for %s:%s", content_type, content_id
                )
                return []

            all_results = []
            for model_type, search_model in self.SEARCHABLE_MODELS.items():
                candidates = search_model.objects.filter(
                    content_embedding__isnull=False
                )
                # IDs are only unique per model, so drop the reference row
                # from its own model only.
                if model_type == content_type:
                    candidates = candidates.exclude(id=content_id)

                candidates = candidates.annotate(
                    distance=CosineDistance('content_embedding', ref_embedding)
                ).annotate(
                    similarity=1 - F('distance')
                ).order_by('distance')[:limit]

                for obj in candidates:
                    result = self._format_search_result(obj)
                    result['content_type'] = model_type
                    all_results.append(result)

            # Sort and limit
            all_results.sort(key=lambda item: item['similarity'], reverse=True)
            return all_results[:limit]

        except Exception as exc:
            logger.exception("Failed to find related content: %s", exc)
            return []

    def keyword_search(
        self,
        query: str,
        model_types: Optional[List[str]] = None,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """
        Fallback keyword search when semantic search fails

        Args:
            query: Text matched case-insensitively as a substring of each
                model's title/name and body fields
            model_types: Keys of SEARCHABLE_MODELS to search; unknown keys
                are skipped, a falsy value means all models
            limit: Maximum results (applied per model and to the total)

        Returns:
            Matching rows in model order, each reported with a fixed
            similarity of 0.5. Empty for a blank query.
        """
        # A blank needle would match every row via icontains.
        if not query or not query.strip():
            return []

        if not model_types:
            model_types = list(self.SEARCHABLE_MODELS)

        all_results = []
        for model_type in model_types:
            model_class = self.SEARCHABLE_MODELS.get(model_type)
            if model_class is None:
                continue

            fields = self._KEYWORD_FIELDS.get(model_type)
            if not fields:
                # An empty Q() would match everything; skip instead.
                logger.warning(
                    "No keyword fields configured for model type: %s",
                    model_type
                )
                continue

            search_q = Q()
            for field_name in fields:
                search_q |= Q(**{f'{field_name}__icontains': query})

            for obj in model_class.objects.filter(search_q)[:limit]:
                # Keyword rows carry no annotated similarity, so the score
                # is supplied explicitly rather than read from the instance.
                result = self._format_search_result(
                    obj, similarity=self._KEYWORD_SIMILARITY
                )
                result['content_type'] = model_type
                all_results.append(result)

        return all_results[:limit]

305 

306 

# Global service instance: a single SearchService constructed when this
# module is imported.
search_service = SearchService()