前言
你有沒有注意到,很多技術部落格的文章底部都會有「相關文章推薦」的區塊?大部分的實作方式是靠標籤(tag)比對 — 如果兩篇文章有相同的標籤,就算「相關」。但這種方法很粗糙,而且完全依賴人工打標籤。
用 AI Embedding 可以做得更好。把每篇文章轉成向量,然後計算向量之間的距離,距離越近代表內容越相似。不需要標籤、不需要人工判斷,AI 自己就能找出語意上相近的文章。
這篇文章會從 Embedding 的原理開始,帶你用 PostgreSQL + pgvector 實作一個完整的相似文章推薦系統。
Embedding 原理
什麼是 Embedding?
Embedding 就是把「東西」(文字、圖片、音訊)轉成一個固定長度的數字向量。這個向量捕捉了「東西」的語意特徵。
"Docker 容器化部署" → [0.12, -0.45, 0.78, ..., 0.33] (384 個數字)
"Kubernetes 叢集管理" → [0.15, -0.42, 0.71, ..., 0.29] (384 個數字)
"今天天氣很好" → [-0.56, 0.23, -0.11, ..., 0.87] (384 個數字)
前兩個向量會很「接近」(因為都是容器相關的技術),第三個向量會離前兩個很「遠」。
Cosine Similarity
衡量兩個向量有多「像」,最常用的方法是 Cosine Similarity:
import numpy as np
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute the cosine similarity of two vectors.

    Returns a value in [-1, 1]:
         1 = same direction
         0 = unrelated (orthogonal)
        -1 = opposite direction

    A zero vector has no direction, so similarity involving one is
    defined here as 0.0 (the naive formula would divide by zero and
    produce NaN with a runtime warning).
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))
# Example: toy 4-dimensional vectors (real embeddings have hundreds of dims)
vec_docker = np.array([0.12, -0.45, 0.78, 0.33])
vec_k8s = np.array([0.15, -0.42, 0.71, 0.29])
vec_weather = np.array([-0.56, 0.23, -0.11, 0.87])
print(f"Docker vs K8s: {cosine_similarity(vec_docker, vec_k8s):.4f}")
# ~0.999 for these toy values -- very similar
print(f"Docker vs Weather: {cosine_similarity(vec_docker, vec_weather):.4f}")
# ~0.03 for these toy values -- essentially unrelated (near zero)
選擇 Embedding 模型
# Side-by-side comparison of common embedding models.
# (Keys and values are the display strings used by the article; kept as-is.)
models = {
    "all-MiniLM-L6-v2": {
        "維度": 384,
        "速度": "很快",
        "品質": "中等",
        "中文": "普通",
        "來源": "開源 (sentence-transformers)",
    },
    "bge-large-zh-v1.5": {
        "維度": 1024,
        "速度": "中等",
        "品質": "很好",
        "中文": "優秀",
        "來源": "開源 (BAAI)",
    },
    "text-embedding-3-small": {
        "維度": 1536,
        "速度": "取決於網路",
        "品質": "很好",
        "中文": "好",
        "來源": "OpenAI API (付費)",
    },
    "nomic-embed-text": {
        "維度": 768,
        "速度": "快 (本地)",
        "品質": "好",
        "中文": "中等",
        "來源": "Ollama (本地免費)",
    },
}
PostgreSQL + pgvector 設定
安裝
# docker-compose.yml
services:
  postgres:
    # Official pgvector image: PostgreSQL 16 with the vector extension bundled.
    image: pgvector/pgvector:pg16
    container_name: pgvector-db
    restart: always
    ports:
      - "5432:5432"
    environment:
      POSTGRES_DB: blog
      POSTGRES_USER: blogger
      POSTGRES_PASSWORD: secure-password
    volumes:
      # Named volume keeps data across container recreation.
      - pgdata:/var/lib/postgresql/data
      # Runs once on first startup (only when the data dir is empty).
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
volumes:
  pgdata:
-- init.sql: database initialization (executed on first container start)
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE IF NOT EXISTS articles (
    id SERIAL PRIMARY KEY,
    title VARCHAR(500) NOT NULL,
    slug VARCHAR(500) UNIQUE NOT NULL,
    content TEXT NOT NULL,
    summary TEXT,
    category VARCHAR(100),
    tags TEXT[],
    published_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Embedding table (kept separate from articles so embeddings can be rebuilt easily)
CREATE TABLE IF NOT EXISTS article_embeddings (
    id SERIAL PRIMARY KEY,
    article_id INTEGER REFERENCES articles(id) ON DELETE CASCADE,
    embedding vector(384), -- dimension must match the embedding model's output
    model_name VARCHAR(100) DEFAULT 'all-MiniLM-L6-v2',
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    UNIQUE(article_id, model_name) -- one embedding per article per model
);

-- Vector index (IVFFlat: suits small-to-medium datasets)
CREATE INDEX IF NOT EXISTS article_embeddings_idx
ON article_embeddings
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 50);

-- For large datasets (>100k articles), use an HNSW index instead:
-- CREATE INDEX article_embeddings_hnsw_idx
-- ON article_embeddings
-- USING hnsw (embedding vector_cosine_ops)
-- WITH (m = 16, ef_construction = 64);
# Start the stack in the background
docker compose up -d
# Verify that the pgvector extension is installed
docker compose exec postgres psql -U blogger -d blog -c "SELECT extversion FROM pg_extension WHERE extname = 'vector';"
完整的推薦系統實作
核心類別
# recommender.py
import psycopg2
from pgvector.psycopg2 import register_vector
import numpy as np
from sentence_transformers import SentenceTransformer
from dataclasses import dataclass
from typing import Optional
@dataclass
class Article:
    """One blog article as stored in the `articles` table."""
    id: int
    title: str
    slug: str  # URL-safe unique identifier
    content: str
    summary: Optional[str] = None
    category: Optional[str] = None
    tags: Optional[list[str]] = None
@dataclass
class Recommendation:
    """A recommended article together with its similarity to the source/query."""
    article: Article
    similarity_score: float  # cosine similarity; higher = more related
class ArticleRecommender:
    """Similar-article recommender backed by PostgreSQL + pgvector.

    Embeddings are produced locally with sentence-transformers and stored
    in the `article_embeddings` table; similarity queries use pgvector's
    cosine-distance operator (`<=>`), where similarity = 1 - distance.
    """

    def __init__(
        self,
        db_url: str = "postgresql://blogger:secure-password@localhost:5432/blog",
        model_name: str = "all-MiniLM-L6-v2",
    ):
        self.db_url = db_url
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.conn = psycopg2.connect(db_url)
        # Teach psycopg2 to adapt numpy arrays <-> pgvector values.
        register_vector(self.conn)

    def close(self) -> None:
        """Release the database connection (safe to call more than once)."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    @staticmethod
    def _row_to_article(row) -> Article:
        """Build an Article from an (id, title, slug, content, summary, category, tags) row."""
        return Article(
            id=row[0], title=row[1], slug=row[2],
            content=row[3], summary=row[4],
            category=row[5], tags=row[6],
        )

    def _get_embedding_text(self, article: Article) -> str:
        """Assemble the text that gets embedded for an article."""
        # The title is included twice to give it extra weight in the vector.
        parts = [
            article.title,
            article.title,
            article.summary or "",
            article.content[:1000],  # cap the body to keep encoder input short
        ]
        if article.tags:
            parts.append(" ".join(article.tags))
        if article.category:
            parts.append(article.category)
        return " ".join(parts)

    def generate_embedding(self, text: str) -> np.ndarray:
        """Encode `text` into an L2-normalized float32 vector."""
        return self.model.encode(
            text,
            normalize_embeddings=True,  # unit length: cosine == dot product
        ).astype(np.float32)

    def index_article(self, article: Article):
        """Create or refresh the embedding row for one article (upsert)."""
        text = self._get_embedding_text(article)
        embedding = self.generate_embedding(text)
        try:
            with self.conn.cursor() as cur:
                cur.execute(
                    """INSERT INTO article_embeddings (article_id, embedding, model_name)
                       VALUES (%s, %s, %s)
                       ON CONFLICT (article_id, model_name)
                       DO UPDATE SET embedding = EXCLUDED.embedding,
                                     created_at = NOW()""",
                    (article.id, embedding.tolist(), self.model_name)
                )
            self.conn.commit()
        except Exception:
            # Roll back so the connection stays usable after a failed statement.
            self.conn.rollback()
            raise

    def index_all_articles(self):
        """(Re)build embeddings for every article in the database."""
        with self.conn.cursor() as cur:
            cur.execute(
                "SELECT id, title, slug, content, summary, category, tags FROM articles"
            )
            articles = [self._row_to_article(row) for row in cur.fetchall()]
        print(f"正在為 {len(articles)} 篇文章建立索引...")
        for i, article in enumerate(articles):
            self.index_article(article)
            if (i + 1) % 10 == 0:
                print(f" 進度: {i + 1}/{len(articles)}")
        print("索引建立完成!")

    def get_similar_articles(
        self,
        article_id: int,
        limit: int = 5,
        min_similarity: float = 0.3,
    ) -> list[Recommendation]:
        """Return up to `limit` articles most similar to `article_id`.

        Results are ordered by descending cosine similarity and filtered to
        those strictly above `min_similarity`. Returns [] when the article
        has no stored embedding for the current model.
        """
        with self.conn.cursor() as cur:
            # Fetch the stored embedding of the target article.
            cur.execute(
                """SELECT embedding
                   FROM article_embeddings
                   WHERE article_id = %s AND model_name = %s""",
                (article_id, self.model_name)
            )
            row = cur.fetchone()
            if not row:
                return []
            target_embedding = row[0]
            # `<=>` is cosine distance; similarity = 1 - distance. The target
            # article itself is excluded from the candidates.
            cur.execute(
                """SELECT
                       a.id, a.title, a.slug, a.content, a.summary,
                       a.category, a.tags,
                       1 - (ae.embedding <=> %s::vector) AS similarity
                   FROM article_embeddings ae
                   JOIN articles a ON a.id = ae.article_id
                   WHERE ae.article_id != %s
                     AND ae.model_name = %s
                     AND 1 - (ae.embedding <=> %s::vector) > %s
                   ORDER BY ae.embedding <=> %s::vector
                   LIMIT %s""",
                (
                    target_embedding, article_id, self.model_name,
                    target_embedding, min_similarity,
                    target_embedding, limit,
                )
            )
            return [
                Recommendation(
                    article=self._row_to_article(row),
                    similarity_score=float(row[7]),
                )
                for row in cur.fetchall()
            ]

    def search_by_text(
        self,
        query: str,
        limit: int = 10,
    ) -> list[Recommendation]:
        """Search articles with a natural-language query (semantic search)."""
        query_embedding = self.generate_embedding(query)
        with self.conn.cursor() as cur:
            cur.execute(
                """SELECT
                       a.id, a.title, a.slug, a.content, a.summary,
                       a.category, a.tags,
                       1 - (ae.embedding <=> %s::vector) AS similarity
                   FROM article_embeddings ae
                   JOIN articles a ON a.id = ae.article_id
                   WHERE ae.model_name = %s
                   ORDER BY ae.embedding <=> %s::vector
                   LIMIT %s""",
                (query_embedding.tolist(), self.model_name,
                 query_embedding.tolist(), limit)
            )
            return [
                Recommendation(
                    article=self._row_to_article(row),
                    similarity_score=float(row[7]),
                )
                for row in cur.fetchall()
            ]
FastAPI 整合
# api.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# One process-wide recommender: loads the model once and holds one DB connection.
app = FastAPI(title="Article Recommendation API")
recommender = ArticleRecommender()

class RecommendationResponse(BaseModel):
    """Flattened recommendation item returned by the API."""
    article_id: int
    title: str
    slug: str
    similarity: float  # cosine similarity; higher = more related

class SearchRequest(BaseModel):
    """Request body for POST /search."""
    query: str
    limit: int = 10  # maximum number of hits to return
@app.get("/articles/{article_id}/similar")
async def get_similar(article_id: int, limit: int = 5):
    """Return similar-article recommendations for one article."""
    recommendations = recommender.get_similar_articles(article_id, limit=limit)
    if not recommendations:
        # Unknown article id, or its embedding has not been built yet.
        raise HTTPException(404, "找不到文章或尚未建立索引")
    response = []
    for rec in recommendations:
        response.append(
            RecommendationResponse(
                article_id=rec.article.id,
                title=rec.article.title,
                slug=rec.article.slug,
                similarity=round(rec.similarity_score, 4),
            )
        )
    return response
@app.post("/search")
async def semantic_search(request: SearchRequest):
    """Semantic (embedding-based) article search."""
    hits = recommender.search_by_text(request.query, limit=request.limit)
    response = []
    for hit in hits:
        response.append(
            RecommendationResponse(
                article_id=hit.article.id,
                title=hit.article.title,
                slug=hit.article.slug,
                similarity=round(hit.similarity_score, 4),
            )
        )
    return response
@app.post("/index/rebuild")
async def rebuild_index():
    """Rebuild embeddings for every article."""
    # NOTE(review): runs synchronously inside the request; for large sites
    # consider moving this to a background task.
    recommender.index_all_articles()
    return {"message": "索引重建完成"}
# Exercising the API
# Fetch similar articles
curl http://localhost:8000/articles/1/similar?limit=5
# Semantic search
curl -X POST http://localhost:8000/search \
  -H "Content-Type: application/json" \
  -d '{"query": "如何用 Docker 部署應用程式", "limit": 5}'
WordPress 整合
<?php
// functions.php — 在 WordPress 中呼叫推薦 API
// Call the recommendation API; always returns an array (possibly empty).
function get_similar_articles($post_id, $limit = 5) {
    $api_url = "http://localhost:8000/articles/{$post_id}/similar?limit={$limit}";
    $response = wp_remote_get($api_url, array(
        'timeout' => 5,
    ));
    if (is_wp_error($response)) {
        return array();
    }
    // Treat non-200 responses (e.g. the API's 404 "not indexed yet") as empty.
    if (wp_remote_retrieve_response_code($response) !== 200) {
        return array();
    }
    $body = wp_remote_retrieve_body($response);
    $data = json_decode($body, true);
    // json_decode() yields null on malformed JSON; never leak that to callers.
    return is_array($data) ? $data : array();
}
// 在 single.php 中使用
// Render the "related articles" box (called from single.php).
function display_similar_articles() {
    global $post;
    $similar = get_similar_articles($post->ID);
    if (empty($similar)) {
        return;
    }
    echo '<div class="similar-articles">';
    echo '<h3>相關文章</h3>';
    echo '<ul>';
    foreach ($similar as $item) {
        $permalink = home_url('/' . $item['slug'] . '/');
        $similarity = round($item['similarity'] * 100);
        // Escape API-sourced values before printing (prevents XSS / broken HTML).
        echo '<li>';
        echo '<a href="' . esc_url($permalink) . '">' . esc_html($item['title']) . '</a>';
        echo '<span class="similarity">' . esc_html($similarity) . '% 相關</span>';
        echo '</li>';
    }
    echo '</ul>';
    echo '</div>';
}
進階優化
1. 增量更新
def on_article_updated(self, article_id: int):
    """Recompute the embedding after an article has been edited."""
    with self.conn.cursor() as cur:
        cur.execute(
            "SELECT id, title, slug, content, summary, category, tags "
            "FROM articles WHERE id = %s",
            (article_id,)
        )
        record = cur.fetchone()
    if record is None:
        # Unknown id: nothing to re-index.
        return
    refreshed = Article(
        id=record[0], title=record[1], slug=record[2],
        content=record[3], summary=record[4],
        category=record[5], tags=record[6],
    )
    self.index_article(refreshed)
    print(f"已更新文章 #{article_id} 的 embedding")
2. 快取推薦結果
import json
import hashlib
from functools import lru_cache
class CachedRecommender(ArticleRecommender):
    """ArticleRecommender with a Redis read-through cache.

    Cached entries store only the fields the public API needs
    (id / title / slug / similarity); on a cache hit the Article objects
    are rebuilt with an empty `content`, so callers that need full
    article bodies should bypass the cache.
    """

    def __init__(self, *args, redis_url="redis://localhost:6379", **kwargs):
        super().__init__(*args, **kwargs)
        import redis  # local import: redis is only needed when caching is enabled
        self.redis = redis.from_url(redis_url)
        self.cache_ttl = 3600  # seconds (1 hour)

    def get_similar_articles(self, article_id, limit=5, min_similarity=0.3):
        """Same contract as the parent, with results cached in Redis.

        BUG FIX: the previous version returned raw dicts on a cache hit but
        Recommendation objects on a miss, crashing any caller that accessed
        `r.article`. Both paths now return list[Recommendation].
        """
        cache_key = f"similar:{article_id}:{limit}:{min_similarity}"
        cached = self.redis.get(cache_key)
        if cached:
            return [
                Recommendation(
                    article=Article(
                        id=entry["article_id"],
                        title=entry["title"],
                        slug=entry["slug"],
                        content="",  # body is not cached (see class docstring)
                    ),
                    similarity_score=entry["similarity"],
                )
                for entry in json.loads(cached)
            ]
        results = super().get_similar_articles(article_id, limit, min_similarity)
        # Serialize the minimal fields and cache them.
        serialized = json.dumps([
            {
                "article_id": r.article.id,
                "title": r.article.title,
                "slug": r.article.slug,
                "similarity": r.similarity_score,
            }
            for r in results
        ], ensure_ascii=False)
        self.redis.setex(cache_key, self.cache_ttl, serialized)
        return results

    def invalidate_cache(self, article_id: int):
        """Drop cached recommendations affected by a change to `article_id`.

        Any article's cached list may contain this article, so the simple
        correct approach is to clear every `similar:*` key (which also covers
        this article's own entries - the old extra per-article scan was
        redundant).
        """
        for key in self.redis.scan_iter("similar:*"):
            self.redis.delete(key)
3. 混合推薦(Embedding + 標籤)
def hybrid_recommend(
    self,
    article_id: int,
    limit: int = 5,
    embedding_weight: float = 0.7,
    tag_weight: float = 0.3,
) -> list[Recommendation]:
    """Blend embedding similarity with tag overlap for the final ranking.

    Final score = embedding_weight * cosine_sim + tag_weight * jaccard(tags).
    Fetches twice the requested limit of embedding candidates so that tag
    overlap can promote items into the top `limit`.
    """
    # Embedding-based candidate pool (over-fetch for re-ranking headroom).
    embedding_results = self.get_similar_articles(article_id, limit=limit * 2)
    # Tags of the target article.
    with self.conn.cursor() as cur:
        cur.execute("SELECT tags FROM articles WHERE id = %s", (article_id,))
        row = cur.fetchone()
    # BUG FIX: previously `cur.fetchone()[0]` raised TypeError when the
    # article id did not exist; treat a missing row as "no tags".
    target_tags = set(row[0] or []) if row else set()
    for result in embedding_results:
        # Tag overlap as Jaccard similarity.
        result_tags = set(result.article.tags or [])
        if target_tags or result_tags:
            tag_sim = len(target_tags & result_tags) / len(target_tags | result_tags)
        else:
            tag_sim = 0.0
        # Weighted blend of the two signals.
        result.similarity_score = (
            embedding_weight * result.similarity_score +
            tag_weight * tag_sim
        )
    # Re-rank by the blended score.
    embedding_results.sort(key=lambda x: x.similarity_score, reverse=True)
    return embedding_results[:limit]
效能考量
文章數量 vs 查詢速度(pgvector IVFFlat 索引):
| 文章數 | 查詢時間 | 記憶體使用 (384維) |
|--------|----------|-------------------|
| 100 | < 1ms | ~150 KB |
| 1,000 | < 5ms | ~1.5 MB |
| 10,000 | < 20ms | ~15 MB |
| 100,000| < 50ms | ~150 MB |
| 1M | < 200ms | ~1.5 GB |
對於個人部落格(通常 < 1000 篇文章),效能完全不是問題。
即使不用索引,暴力搜尋都夠快。
# 效能測試腳本
import time
def benchmark_search(recommender, article_id, iterations=100):
    """Measure query latency; prints the mean and P50/P95/P99 in milliseconds."""
    latencies_ms = []
    for _ in range(iterations):
        t0 = time.perf_counter()
        recommender.get_similar_articles(article_id, limit=5)
        latencies_ms.append((time.perf_counter() - t0) * 1000)
    print(f"平均查詢時間: {np.mean(latencies_ms):.2f} ms")
    print(f"P50: {np.percentile(latencies_ms, 50):.2f} ms")
    print(f"P95: {np.percentile(latencies_ms, 95):.2f} ms")
    print(f"P99: {np.percentile(latencies_ms, 99):.2f} ms")
小結
用 AI Embedding 做相似文章推薦,比傳統的標籤比對更聰明、更省人力。整個系統的核心其實很簡單:
- 選一個 Embedding 模型(入門用 all-MiniLM-L6-v2,中文用 bge-large-zh-v1.5)
- 把文章轉成向量存進 pgvector
- 查詢時算 cosine similarity 排序
就這樣,三步搞定。
如果你的部落格是 WordPress,像我一樣另外跑一個 Python API 來處理推薦,透過 wp_remote_get 呼叫,整合起來並不複雜。
後續可以探索的方向:
- 多模態 Embedding:不只看文字,也把文章中的圖片納入考慮
- 用戶行為加權:把閱讀歷史、點擊率加入推薦公式
- Cross-Encoder Re-ranking:先粗篩再精排,提升推薦品質
- 其他向量資料庫:Milvus、Qdrant、Weaviate 等,適合更大規模的場景
延伸閱讀:
- pgvector 官方文件:https://github.com/pgvector/pgvector
- MTEB Embedding 模型排行榜:https://huggingface.co/spaces/mteb/leaderboard
- Sentence Transformers 文件:https://www.sbert.net
- 推薦系統設計模式:Collaborative Filtering vs Content-Based