ソースを参照

大优化, 截取准确大提高

AnlaAnla 1 ヶ月 前
コミット
571f2d8cd4
4 ファイル変更、29 行追加、20 行削除
  1. Test/RapidOCR_test.py (+1 −1)
  2. Test/seg_test02.py (+1 −1)
  3. app/core/config.py (+5 −4)
  4. app/services/video_service.py (+22 −14)

+ 1 - 1
Test/RapidOCR_test.py

@@ -2,7 +2,7 @@ from rapidocr import RapidOCR
 
 engine = RapidOCR()
 
-img_url = r"C:\Code\ML\Image\_TEST_DATA\Card_test\test05\945e0cc0884c8766a5883ea9593def9d.png"
+img_url = r"C:\Code\ML\Project\CardVideoSummary\static\frames\1c4e0b13-c22a-4b24-adc9-633ae8148d2c_18047000.jpg"
 result = engine(img_url)
 print(result)
 

+ 1 - 1
Test/seg_test02.py

@@ -43,5 +43,5 @@ def show(img_path):
     plt.show()
 
 if __name__ == '__main__':
-    show("../static/frames/9b6704fa-2bc7-40e7-b0a5-f41fd9b8b93f_10930288.jpg")
+    show(r"C:\Code\ML\Project\CardVideoSummary\static\frames\1c4e0b13-c22a-4b24-adc9-633ae8148d2c_18047000.jpg")
     print()

+ 5 - 4
app/core/config.py

@@ -1,6 +1,7 @@
 import os
 import socket
 
+
 def get_local_ip():
     """获取本机局域网 IP"""
     try:
@@ -16,7 +17,6 @@ def get_local_ip():
     return ip
 
 
-
 class Settings:
     LOCAL_IP: str = get_local_ip()
     LOCAL_PORT: int = 7721
@@ -40,7 +40,8 @@ class Settings:
     # ==========================================
 
     # HuggingFace 语义分割模型路径 (用于识别手和卡片)
-    VIDEO_SEG_MODEL_DIR: str = r"C:\Code\ML\Model\Card_Seg\segformer_card_hand02_safetensors"
+    # VIDEO_SEG_MODEL_DIR: str = r"C:\Code\ML\Model\Card_Seg\segformer_card_hand02_safetensors"
+    VIDEO_SEG_MODEL_DIR: str = "/home/martin/ML/Model/card_seg/segformer_card_hand02_safetensors"
 
     # 目标时间戳前后的搜索范围 (毫秒) -> 决定了去目标时间戳附近多大范围内寻找最佳帧
     VIDEO_SEARCH_BEFORE_MS: int = int(os.getenv("VIDEO_SEARCH_BEFORE_MS", "1000"))  # 往前找/毫秒
@@ -50,7 +51,7 @@ class Settings:
     VIDEO_ANALYSIS_FPS: float = float(os.getenv("VIDEO_ANALYSIS_FPS", "5.0"))
 
     # 只对综合得分排名前 K 的候选帧进行 OCR 识别 (OCR 比较耗时,没必要每帧都跑)
-    VIDEO_OCR_TOP_K: int = int(os.getenv("VIDEO_OCR_TOP_K", "5"))
+    VIDEO_OCR_TOP_K: int = int(os.getenv("VIDEO_OCR_TOP_K", "15"))
 
     # 目标停留时间 (秒) -> 用来奖励那些在画面中稳定停留的帧 (排除一闪而过的残影)
     VIDEO_DWELL_TARGET_SECONDS: float = float(os.getenv("VIDEO_DWELL_TARGET_SECONDS", "1.2"))
@@ -67,4 +68,4 @@ class Settings:
 settings = Settings()
 
 # 确保图片输出目录存在,避免运行报错
-os.makedirs(settings.FRAMES_DIR, exist_ok=True)
+os.makedirs(settings.FRAMES_DIR, exist_ok=True)

+ 22 - 14
app/services/video_service.py

@@ -6,6 +6,8 @@ from dataclasses import dataclass
 from typing import Any, Optional
 
 import cv2
+import numpy as np
+import difflib
 
 from app.core.config import settings
 from app.core.logger import get_logger
@@ -87,7 +89,9 @@ class VideoService:
         方差越大,说明边缘信息越丰富(越不模糊)。
         """
         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-        return float(cv2.Laplacian(gray, cv2.CV_64F).var())
+        # 增加高斯模糊,过滤掉反光产生的噪点和高频毛刺
+        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
+        return float(cv2.Laplacian(blurred, cv2.CV_64F).var())
 
     def calculate_weight(self, current_time_ms: int, target_time_ms: int) -> float:
         """利用高斯函数计算时间权重。距离 target_time_ms 越近,返回值越接近 1.0"""
@@ -500,17 +504,17 @@ class VideoService:
         score = 0.0
         ocr_set = set(ocr_tokens)
         for token in expected_tokens:
-            if token in ocr_set:
-                score += 1.0  # 完全命中给 1 分
-                continue
-
-            # 兼容:如果目标 token 是 OCR结果的子串,或者 OCR结果是 token的子串 (例如 "PIKACHU" 匹配出 "PIKACH")
-            partial_match = any(
-                len(other) >= 2 and (token in other or other in token)
-                for other in ocr_set
-            )
-            if partial_match:
-                score += 0.6  # 部分匹配给 0.6 
+            best_ratio = 0.0
+            for other in ocr_tokens:
+                # 计算字符串相似度 (0 到 1)
+                ratio = difflib.SequenceMatcher(None, token, other).ratio()
+                if ratio > best_ratio:
+                    best_ratio = ratio
+
+            if best_ratio > 0.85:
+                score += 1.0  # 相似度极高,视为完全命中
+            elif best_ratio > 0.6:
+                score += 0.6  # 存在一定错别字,给部分分
 
         return min(score / len(expected_tokens), 1.0)
 
@@ -611,8 +615,12 @@ class VideoService:
         if not scoring_candidates:
             scoring_candidates = candidates
 
-        # 找准当前窗口期的相对最大清晰度作为归一化基准
-        max_sharpness = max(candidate.sharpness for candidate in scoring_candidates) if scoring_candidates else 0.0
+        # 改为使用 90 分位数,防止单帧反光噪点拉爆整个分数池
+        if scoring_candidates:
+            sharpnesses = [c.sharpness for c in scoring_candidates]
+            max_sharpness = float(np.percentile(sharpnesses, 90))
+        else:
+            max_sharpness = 0.0
         segmentation_used = any(candidate.segmentation_used for candidate in candidates)
 
         expected = self._build_expected_text(card_output)