BM25 Sparse Embedding

An algorithm similar to TF-IDF for generating sparse embeddings, well suited to keyword matching in RAG.
BM25
sparse embedding
retrieval
keyword matching
Author

Shataxi Dubey

Published

April 26, 2026
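For reference, the full BM25 weight that this post simplifies looks like this: for a query $q$ and document $D$,

$$
\text{score}(D, q) = \sum_{t \in q} \mathrm{IDF}(t)\cdot \frac{f(t, D)\,(k_1 + 1)}{f(t, D) + k_1\left(1 - b + b\,\frac{|D|}{\mathrm{avgdl}}\right)}
$$

where $f(t, D)$ is the frequency of term $t$ in $D$, $|D|$ is the document length, avgdl is the average document length in the corpus, and $k_1$ (typically 1.2 to 2.0) and $b$ (typically 0.75) control term-frequency saturation and length normalisation. The implementation below keeps only a log-normalised term frequency and leaves IDF to the retrieval engine.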

import re
from collections import Counter
import math

Tokenize the string

def tokenize_bm25(text: str) -> list[str]:
    """Lowercase and extract word tokens — no subword splitting."""
    return re.findall(r"\b\w+\b", text.lower())
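A quick illustrative check of the tokenizer (the sample string is mine, not from the original examples):

print(tokenize_bm25("Machine Learning Engineer (ML/AI)"))
['machine', 'learning', 'engineer', 'ml', 'ai']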
def build_vocab(corpus: list[str]) -> dict[str, int]:
    """Assign a stable integer index to every unique term in the corpus."""
    vocab: dict[str, int] = {}
    for text in corpus:
        for token in tokenize_bm25(text):
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab
def bm25_tf_sparse(text: str, vocab: dict[str, int]) -> tuple[list[int], list[float]]:
    """
    Compute a TF-only sparse vector for one document.
    IDF is intentionally omitted here — in production Qdrant applies IDF at
    query time via SparseVectorParams(modifier=Modifier.IDF).
    For this standalone comparison we apply a simple log-normalised TF instead.
    """
    tokens = tokenize_bm25(text)
    counts = Counter(tokens)
    indices, values = [], []
    for term, tf in counts.items():
        if term in vocab:
            indices.append(vocab[term])
            # log-normalised TF: 1 + log(tf)
            values.append(1.0 + math.log(tf))
    return indices, values
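The docstring above refers to how IDF is handled in production. A minimal sketch of that setup, assuming qdrant-client 1.10+ with an in-memory instance (the collection name "skills" and sparse vector name "bm25" are placeholders of my own):

from qdrant_client import QdrantClient, models

client = QdrantClient(":memory:")  # in-memory instance for illustration
client.create_collection(
    collection_name="skills",
    vectors_config={},  # sparse-only collection, no dense vectors
    sparse_vectors_config={
        # Qdrant computes corpus-wide IDF and applies it at query time
        "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF)
    },
)

doc = "machine learning engineer"
doc_vocab = build_vocab([doc])
indices, values = bm25_tf_sparse(doc, doc_vocab)
client.upsert(
    collection_name="skills",
    points=[
        models.PointStruct(
            id=1,
            vector={"bm25": models.SparseVector(indices=indices, values=values)},
        )
    ],
)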
def sparse_cosine_similarity(idx_to_term, indices_a, values_a, indices_b, values_b):
    """
    Cosine similarity between two sparse vectors given as parallel
    (indices, values) lists; also prints the terms the two vectors share.
    """
    vec_a = dict(zip(indices_a, values_a))
    vec_b = dict(zip(indices_b, values_b))
    
    # Dot product
    common_indices = set(vec_a.keys()) & set(vec_b.keys())
    print("=====common words=====")
    common_words = [idx_to_term[idx] for idx in common_indices]  
    print(common_words)

    # The dot product between the words common between the two sparse vectors
    dot = sum(vec_a[i] * vec_b[i] for i in common_indices)
    
    # Magnitudes
    mag_a = sum(v ** 2 for v in vec_a.values()) ** 0.5
    mag_b = sum(v ** 2 for v in vec_b.values()) ** 0.5
    
    if mag_a == 0 or mag_b == 0:
        return 0.0
    
    return dot / (mag_a * mag_b)
# jd_req_skills='''● Bachelor’s degree in Education or a related field ● Valid State Teaching License (Texas) ● PRAXIS II Certification'''
# candidate_skills = '''EDUCATION AND LICENSING Jun 2008 - Present PRAXIS II Certification Austin, Texas Sep 2005 - Jun 2008 B.A. in Education Austin, Texas'''

# jd_req_skills = "Certified Kubernetes Administrator"
# candidate_skills = "CKA"

jd_req_skills = "machine learning engineer"
candidate_skills = "Artificial intelligence engineer"

# jd_req_skills = "Experience with Kubernetes orchestration"
# candidate_skills = "Managed AWS infrastructure, deployed 50+ microservices"

corpus = [jd_req_skills, candidate_skills]
vocab = build_vocab(corpus)

print(f"Vocabulary {vocab}")
Vocabulary {'machine': 0, 'learning': 1, 'engineer': 2, 'artificial': 3, 'intelligence': 4}
jd_bm25_indices, jd_bm25_values = bm25_tf_sparse(jd_req_skills, vocab)
candidate_bm25_indices, candidate_bm25_values = bm25_tf_sparse(candidate_skills, vocab)

# Reverse vocab for readable output
idx_to_term = {v: k for k, v in vocab.items()}

print("\n======BM25: JD required skills=====")
print([idx_to_term[i] for i in jd_bm25_indices])

print("======BM25: Candidate skills=====")
print([idx_to_term[i] for i in candidate_bm25_indices])

======BM25: JD required skills=====
['machine', 'learning', 'engineer']
======BM25: Candidate skills=====
['artificial', 'intelligence', 'engineer']
bm25_similarity = sparse_cosine_similarity(
    idx_to_term,
    jd_bm25_indices, jd_bm25_values,
    candidate_bm25_indices, candidate_bm25_values,
)

# sparse_cosine_similarity already prints the common words above; recompute
# the overlap here for a clearly labelled summary
common_indices = set(jd_bm25_indices) & set(candidate_bm25_indices)
print("======BM25: Common tokens=====")
print([idx_to_term[i] for i in common_indices])

print(f"BM25   similarity : {bm25_similarity:.4f}")
=====common words=====
['engineer']
======BM25: Common tokens=====
['engineer']
BM25   similarity : 0.3333
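The 0.3333 can be verified by hand: every term occurs once, so each weight is 1 + log(1) = 1.0. The only shared term is 'engineer', giving a dot product of 1.0, while both vectors contain three terms, so each magnitude is sqrt(3). The similarity is therefore 1 / (sqrt(3) * sqrt(3)) = 1/3 ≈ 0.3333.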

This example demonstrates how, in an Applicant Tracking System (ATS) that relies on keyword matching, similarity scores can swing sharply with changes in the wording of either the resume or the job description.

Pro tip: To improve your chances of passing an ATS for a specific role, incorporate the exact wording used in the job description into your resume.
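
As a quick check of both points, re-running the same pipeline on the Kubernetes pair left commented out above shows the effect directly: 'CKA' shares no tokens with 'Certified Kubernetes Administrator', so the score collapses to 0.0, while echoing the exact phrase back scores 1.0 (the second, exact-match string is an illustrative addition of mine):

for jd, cand in [
    ("Certified Kubernetes Administrator", "CKA"),
    ("Certified Kubernetes Administrator", "Certified Kubernetes Administrator"),
]:
    pair_vocab = build_vocab([jd, cand])
    pair_idx_to_term = {i: t for t, i in pair_vocab.items()}
    jd_idx, jd_vals = bm25_tf_sparse(jd, pair_vocab)
    cand_idx, cand_vals = bm25_tf_sparse(cand, pair_vocab)
    score = sparse_cosine_similarity(pair_idx_to_term, jd_idx, jd_vals, cand_idx, cand_vals)
    print(f"{jd} | {cand} -> {score:.4f}")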