66 lines
1.8 KiB
Python
66 lines
1.8 KiB
Python
import logging
|
|
import re
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# In-memory copy of the word list loaded from sensitive_words.txt.
_SENSITIVE_WORDS: list[str] = []


# Compiled case-insensitive alternation over _SENSITIVE_WORDS;
# None when the word list is empty or the file is missing.
_PATTERN: re.Pattern | None = None
|
|
|
|
|
|
def _load_words():
    """Load the sensitive word list from disk and compile the match pattern.

    Reads ``sensitive_words.txt`` (one word per line, blank lines ignored)
    from three directories above this file, storing the words in
    ``_SENSITIVE_WORDS`` and a compiled case-insensitive alternation in
    ``_PATTERN``. When the file is missing or empty, the list is cleared
    and ``_PATTERN`` is set to None (callers then treat all text as safe).
    """
    global _SENSITIVE_WORDS, _PATTERN

    words_file = Path(__file__).parent.parent.parent / "sensitive_words.txt"

    if words_file.exists():
        raw = words_file.read_text(encoding="utf-8")
        _SENSITIVE_WORDS = [w.strip() for w in raw.splitlines() if w.strip()]
    else:
        _SENSITIVE_WORDS = []

    if _SENSITIVE_WORDS:
        # Sort longest-first before joining: Python's re alternation is
        # leftmost-match, not longest-match, so "bad|badword" would only
        # ever match "bad" inside "badword", leaving "word" unfiltered.
        ordered = sorted(_SENSITIVE_WORDS, key=len, reverse=True)
        escaped = [re.escape(w) for w in ordered]
        _PATTERN = re.compile("|".join(escaped), re.IGNORECASE)
    else:
        _PATTERN = None
|
|
|
|
|
|
# Populate the word list and compiled pattern once at import time.
_load_words()
|
|
|
|
|
|
def reload_sensitive_words():
    """Re-read the sensitive word list from disk and rebuild the pattern.

    Intended for runtime hot-reload: call after editing sensitive_words.txt
    to pick up changes without restarting the process.
    """
    _load_words()
    # Log the new list size so operators can confirm the reload took effect.
    logger.info("Reloaded %d sensitive words", len(_SENSITIVE_WORDS))
|
|
|
|
|
|
def check_text(text: str) -> dict:
    """Scan *text* for sensitive words.

    Returns a dict of the form ``{"safe": bool, "matched": [...]}`` where
    ``matched`` holds the distinct substrings that hit the word list
    (empty when the text is clean, empty, or no word list is loaded).
    """
    # Empty text, or no compiled pattern (no word list): trivially safe.
    if not text or _PATTERN is None:
        return {"safe": True, "matched": []}

    hits = _PATTERN.findall(text)
    if not hits:
        return {"safe": True, "matched": []}

    # De-duplicate the raw matches before reporting them.
    return {"safe": False, "matched": list(set(hits))}
|
|
|
|
|
|
def filter_text(text: str, replacement: str = "**") -> str:
    """Return *text* with every sensitive word substituted by *replacement*.

    Empty text, or an unloaded word list (``_PATTERN`` is None), passes
    through unchanged.
    """
    if not text or _PATTERN is None:
        return text

    # Single pass over the compiled alternation handles all words at once.
    return _PATTERN.sub(replacement, text)
|
|
|
|
|
|
async def check_image_safety(image_url: str) -> dict:
    """Stub for a third-party image moderation check.

    A production deployment should replace this with a real call to a
    content-moderation API (e.g. Tencent Cloud / Aliyun). As written it
    always reports the image as safe.

    Returns a dict of the form ``{"safe": bool, "labels": [...]}``.
    """
    # Trace-level record of what would have been audited.
    logger.debug("Image safety check (placeholder): %s", image_url)
    return {"safe": True, "labels": []}
|