Minimal Python scanner



This content originally appeared on DEV Community and was authored by Vaisakh

import sys, re
from email import policy
from email.parser import BytesParser

def read_eml(path):
    """Parse the .eml file at *path* and return the resulting message object.

    The file is opened in binary mode and handed to a ``BytesParser``
    configured with ``policy.default``.
    """
    parser = BytesParser(policy=policy.default)
    with open(path, 'rb') as handle:
        message = parser.parse(handle)
    return message

def _header_param(part, name, header='content-type'):
    """Return parameter *name* of *header* on *part* as a str ('' if absent)."""
    # Function-scope import: the file header only pulls `policy` and
    # `BytesParser` from the email package.
    from email.utils import collapse_rfc2231_value

    value = part.get_param(name, header=header)
    if value is None:
        return ''
    # get_param() may return an RFC 2231 (charset, language, value) tuple;
    # collapse_rfc2231_value() turns either form into a plain string.
    return collapse_rfc2231_value(value).strip()


def collect_tokens(msg):
    """Collect the header-derived identifiers of *msg* worth scanning for.

    Tokens are gathered in this order:

    1. the ``Message-ID`` value, without the surrounding ``<`` ``>``;
    2. the ``X-Header-ConversationID`` value (adjust to the header you use);
    3. the ``boundary`` parameter of the top-level ``Content-Type``;
    4. for every part yielded by ``msg.walk()``: the ``filename`` parameter
       of ``Content-Disposition``, then the ``name`` parameter of
       ``Content-Type``.

    Parameters are read through the email API rather than with regular
    expressions, so quoted values that contain ``;`` are returned whole
    instead of being cut at the first semicolon.

    Args:
        msg: a parsed message object (as returned by ``read_eml``).

    Returns:
        A list of unique, non-empty token strings in first-seen order.
    """
    candidates = []

    # Message-ID (strip < >)
    mid = (msg.get('Message-ID') or '').strip()
    if mid.startswith('<') and mid.endswith('>'):
        mid = mid[1:-1]
    candidates.append(mid)

    # Conversation / custom IDs (adjust header names you use)
    candidates.append((msg.get('X-Header-ConversationID') or '').strip())

    # Top-level multipart boundary ('' when there is none)
    candidates.append(msg.get_boundary('') or '')

    # Attachment filenames (filename= / name=)
    for part in msg.walk():
        candidates.append(
            _header_param(part, 'filename', header='content-disposition')
        )
        candidates.append(_header_param(part, 'name'))

    # dict.fromkeys() removes duplicates while keeping first-seen order;
    # empty strings stand for "header/parameter not present" and are dropped.
    return [tok for tok in dict.fromkeys(candidates) if tok]

def decode_part(part):
    """Decode one MIME part into a ``(kind, data)`` pair.

    Returns:
        ``(None, None)`` when the part has no decodable payload;
        ``('text', str)`` for ``text/*``, ``application/xml`` and
        ``application/json`` parts whose bytes could be decoded with the
        declared charset (``utf-8`` when none is declared, undecodable
        bytes ignored);
        ``('bytes', bytes)`` for every other part, and for textual parts
        whose decoding failed.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return None, None

    content_type = part.get_content_type().lower()
    encoding = part.get_content_charset() or 'utf-8'

    textual = (
        content_type.startswith('text/')
        or content_type in ('application/xml', 'application/json')
    )
    if not textual:
        return 'bytes', raw

    # Best effort: any decoding failure (e.g. an unknown charset name)
    # falls back to handing out the raw bytes.
    try:
        text = raw.decode(encoding, errors='ignore')
    except Exception:
        return 'bytes', raw
    return 'text', text

def scan(path):
    """Report header-derived tokens that also occur inside decoded parts.

    Parses the .eml file at *path*, collects its tokens with
    ``collect_tokens`` and prints them, then walks every MIME part and
    prints a ``[HIT]`` line for each token found in the part's decoded
    payload. Text parts are searched as strings and the hit is shown with
    surrounding context; other parts are searched as bytes. A closing line
    is printed when nothing was found.

    Args:
        path: filesystem path of the .eml file to scan.

    Returns:
        None; all results are printed to standard output.
    """
    msg = read_eml(path)
    tokens = collect_tokens(msg)
    print("Tokens to scan for:", tokens)

    # Encode every needle once, up front. UTF-8 keeps non-ASCII tokens
    # intact: dropping the non-ASCII characters would search for a different
    # string, and could even yield b'', which is "contained" in every payload.
    # 'surrogatepass' only guarantees that encoding itself can never raise.
    needles = [
        (tok, tok.encode('utf-8', 'surrogatepass')) for tok in tokens if tok
    ]

    hit_any = False
    # The part number counts every walked part, including skipped ones.
    for idx, part in enumerate(msg.walk()):
        kind, data = decode_part(part)
        if data is None:
            continue
        ctype = part.get_content_type()
        for tok, tok_bytes in needles:
            if kind == 'text':
                pos = data.find(tok)
                if pos != -1:
                    # Context window: up to 40 characters either side of
                    # the position where the token starts.
                    start = max(0, pos - 40)
                    end = min(len(data), pos + 40)
                    print(f"[HIT] part#{idx} {ctype} contains '{tok}' → ...{data[start:end]}...")
                    hit_any = True
            elif tok_bytes in data:
                print(f"[HIT] part#{idx} {ctype} (binary) contains token '{tok}'")
                hit_any = True
    if not hit_any:
        print("No tokens found inside decoded parts.")

# Script entry point: expects exactly one argument, the .eml file to scan.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python scan_eml_tokens.py <file.eml>")
        sys.exit(1)
    scan(cli_args[0])

How to use

python scan_eml_tokens.py path/to/sample.eml

It prints all header-derived tokens and then reports, for each decoded part, the first occurrence of every token found in it.

If it finds none, it is safe to change only the headers, filenames, and boundaries in the generator.


This content originally appeared on DEV Community and was authored by Vaisakh