This content originally appeared on DEV Community and was authored by Vaisakh
import sys, re
from email import policy
from email.parser import BytesParser
def read_eml(path):
    """Parse the .eml file at *path* and return the parsed message.

    The file is opened in binary mode and fed to a ``BytesParser`` that is
    configured with ``policy.default``.
    """
    eml_parser = BytesParser(policy=policy.default)
    with open(path, 'rb') as handle:
        message = eml_parser.parse(handle)
    return message
def collect_tokens(msg):
    """Return the unique identifying tokens found in the headers of *msg*.

    Tokens are gathered in this order, keeping the first occurrence of each:

    1. the top-level ``Message-ID`` with a surrounding ``<`` ``>`` removed;
    2. the ``X-Header-ConversationID`` header (adjust the header name to the
       custom one you use);
    3. for every part yielded by ``msg.walk()``: its ``boundary`` parameter,
       its ``Content-Disposition`` ``filename`` and its ``Content-Type``
       ``name``.

    Parameters are read through ``get_param`` rather than regular
    expressions, so a quoted value containing ``;`` is returned whole, and
    the boundaries of nested multipart containers are collected too.

    Args:
        msg: A parsed email message object providing ``get``, ``walk`` and
            ``get_param``.

    Returns:
        A list of non-empty strings without duplicates.
    """
    # Local import: the file's top-level imports do not provide email.utils.
    from email.utils import collapse_rfc2231_value

    def param(part, name, header):
        """Return parameter *name* of *header* on *part* as text, or ''."""
        value = part.get_param(name, header=header)
        if value is None:
            return ''
        # get_param may return an RFC 2231 (charset, language, value) tuple;
        # collapse_rfc2231_value turns either form into a plain string.
        return collapse_rfc2231_value(value).strip()

    candidates = []

    # Message-ID (strip < >)
    mid = (msg.get('Message-ID') or '').strip()
    if mid.startswith('<') and mid.endswith('>'):
        mid = mid[1:-1]
    candidates.append(mid)

    # Conversation / custom IDs (adjust header names you use)
    candidates.append((msg.get('X-Header-ConversationID') or '').strip())

    # Boundaries and attachment names of every part, nested ones included
    for part in msg.walk():
        candidates.append(param(part, 'boundary', 'content-type'))
        candidates.append(param(part, 'filename', 'content-disposition'))
        candidates.append(param(part, 'name', 'content-type'))

    # Drop empties and duplicates while keeping first-seen order
    seen = set()
    unique = []
    for token in candidates:
        if token and token not in seen:
            seen.add(token)
            unique.append(token)
    return unique
def decode_part(part):
    """Decode one MIME part into a ``(kind, data)`` pair.

    Returns:
        ``(None, None)`` when ``get_payload(decode=True)`` yields ``None``;
        ``('text', str)`` for ``text/*``, ``application/xml`` and
        ``application/json`` parts, decoded with the part's charset (UTF-8
        when none is declared), dropping undecodable bytes;
        ``('bytes', bytes)`` for every other content type, or when the
        decode call raised.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return None, None

    content_type = part.get_content_type().lower()
    encoding = part.get_content_charset() or 'utf-8'
    is_textual = (
        content_type.startswith('text/')
        or content_type in ('application/xml', 'application/json')
    )
    if not is_textual:
        return 'bytes', raw

    try:
        text = raw.decode(encoding, errors='ignore')
    except Exception:
        # Best effort: e.g. an unknown charset name falls back to raw bytes.
        return 'bytes', raw
    return 'text', text
def scan(path):
    """Print every header-derived token that also occurs inside a decoded part.

    The message at *path* is parsed with ``read_eml``, its tokens are
    gathered with ``collect_tokens``, and each part from ``msg.walk()`` is
    decoded with ``decode_part``. For text parts the first occurrence of a
    token is printed with up to 40 characters of context on each side; for
    binary parts only the fact of a match is printed. Parts are numbered by
    their position in ``msg.walk()``, including parts that have no payload.

    Args:
        path: Filesystem path of the .eml file to inspect.

    Returns:
        None. All results are written to standard output.
    """
    msg = read_eml(path)
    tokens = collect_tokens(msg)
    print("Tokens to scan for:", tokens)

    # Encode each token once, up front. UTF-8 keeps non-ASCII characters;
    # the previous ASCII/'ignore' encoding could shrink a token to b'', and
    # b'' is "in" every bytes object, so it produced false hits.
    needles = [(tok, tok.encode('utf-8', 'ignore')) for tok in tokens if tok]

    hit_any = False
    for idx, part in enumerate(msg.walk()):
        kind, data = decode_part(part)
        if data is None:
            continue
        ctype = part.get_content_type()
        for tok, needle in needles:
            if kind == 'text':
                pos = data.find(tok)
                if pos == -1:
                    continue
                start = max(0, pos - 40)
                # Extend the excerpt past the END of the token so that long
                # tokens are shown in full with trailing context.
                end = min(len(data), pos + len(tok) + 40)
                print(f"[HIT] part#{idx} {ctype} contains '{tok}' → ...{data[start:end]}...")
                hit_any = True
            elif needle and needle in data:
                # An empty needle (nothing encodable) can never be a real hit.
                print(f"[HIT] part#{idx} {ctype} (binary) contains token '{tok}'")
                hit_any = True

    if not hit_any:
        print("No tokens found inside decoded parts.")
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the .eml file to scan.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python scan_eml_tokens.py <file.eml>")
        sys.exit(1)
    scan(cli_args[0])
How to use
python scan_eml_tokens.py path/to/sample.eml
The script prints every header-derived token and then reports each decoded part in which one of them occurs.
If it finds none, it is safe to change only the headers, filenames, and boundaries in the generator.
This content originally appeared on DEV Community and was authored by Vaisakh