Anonymous View
Skip to content

_remote_debugging frame cache can reuse stale frame anchors #151613

@pablogsal

Description

@pablogsal

The remote debugging frame cache uses last_profiled_frame as an address anchor, so it can hit an ABA case when a frame address is reused after the old frame has returned. A profiler sample can then validate against the same address but append cached parent frames from a previous stack, producing impossible mixed stacks such as a b_leaf frame with an a_parent caller. PR #151437 fixes missing anchor updates, but the cache should also distinguish reused frame addresses, for example by pairing the anchor with a monotonic sequence, so cached continuations are only reused for the exact frame instance previously sampled.

I confirmed the following standalone reproducer. It repeatedly samples a process that alternates between two separate call chains. Seeing names from both chains in one sampled stack is impossible unless cached parent frames were spliced onto the wrong live frame.

import contextlib
import os
import socket
import subprocess
import sys
import tempfile
import textwrap

from _remote_debugging import PROCESS_VM_READV_SUPPORTED, RemoteUnwinder

# Exception types that are suppressed around each individual sampling attempt
# further down in this script.
TRANSIENT_ERRORS = (OSError, RuntimeError, UnicodeDecodeError)

# Bail out early unless we are on Linux and the extension module reports
# process_vm_readv support.
if not (sys.platform == "linux" and PROCESS_VM_READV_SUPPORTED):
    raise SystemExit("requires Linux with process_vm_readv support")


def find_free_port():
    """Return a TCP port number that the OS handed out for localhost.

    A throwaway IPv4 stream socket is bound to port 0 so the kernel picks
    the port; the socket is closed again before the number is returned.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(("localhost", 0))
        address = probe.getsockname()
        return address[1]
    finally:
        probe.close()


def wait_for(sock, expected):
    """Block until the bytes *expected* have been received on *sock*.

    A 10 second timeout is set on the socket, then data is read in chunks
    of up to 4096 bytes and accumulated until *expected* appears somewhere
    in it.

    Raises:
        RuntimeError: if the peer closes the connection (``recv`` returns
            empty bytes) before *expected* has been seen.
    """
    sock.settimeout(10.0)
    received = b""
    while True:
        # Check first so that an already-satisfied wait never calls recv().
        if expected in received:
            return
        piece = sock.recv(4096)
        if piece:
            received += piece
            continue
        raise RuntimeError(f"target exited before {expected!r}")


# Pick the port up front: it is interpolated into the target's source below
# and later used to bind the listening socket in this process.
# NOTE(review): find_free_port() closes its probe socket before returning, so
# the port is unbound until the server binds it again further down.
port = find_free_port()

# Source of the profiled child program. It connects back to this process,
# sends b"ready", and then loops forever alternating between two separate
# call chains (a_parent -> a_leaf -> burn_a and b_parent -> b_leaf -> burn_b).
# The backslash after the opening quotes keeps the script from starting with
# an empty line.
target = f"""\
import socket

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(("localhost", {port}))
sock.sendall(b"ready")

def burn_a():
    total = 0
    for i in range(20000):
        total += i
    return total

def burn_b():
    total = 0
    for i in range(20000):
        total += i
    return total

def a_leaf():
    return burn_a()

def b_leaf():
    return burn_b()

def a_parent():
    return a_leaf()

def b_parent():
    return b_leaf()

while True:
    a_parent()
    b_parent()
"""

# Driver: write the target program to a scratch directory, launch it, wait for
# its "ready" handshake, then sample its stacks up to 8000 times looking for a
# stack that contains function names from both call chains at once.
with tempfile.TemporaryDirectory() as workdir:
    script = os.path.join(workdir, "target.py")
    with open(script, "w", encoding="utf-8") as script_file:
        script_file.write(textwrap.dedent(target))

    # Listening socket the target connects back to on the pre-selected port.
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind(("localhost", port))
    server.listen(1)
    server.settimeout(10.0)

    proc = subprocess.Popen([sys.executable, script])
    conn = None
    try:
        conn, _peer = server.accept()
        wait_for(conn, b"ready")
        unwinder = RemoteUnwinder(
            proc.pid, all_threads=True, cache_frames=True, stats=True
        )
        branch_a = {"a_parent", "a_leaf", "burn_a"}
        branch_b = {"b_parent", "b_leaf", "burn_b"}

        for sample in range(1, 8001):
            # Errors listed in TRANSIENT_ERRORS abort only this one sample;
            # SystemExit is not among them and therefore propagates.
            with contextlib.suppress(*TRANSIENT_ERRORS):
                for interp in unwinder.get_stack_trace():
                    for thread in interp.threads:
                        funcs = [frame.funcname for frame in thread.frame_info]
                        seen = set(funcs)
                        # A stack touching at most one chain is consistent.
                        if seen.isdisjoint(branch_a) or seen.isdisjoint(branch_b):
                            continue
                        print(f"mixed stack found at sample {sample}")
                        print(funcs)
                        print(unwinder.get_stats())
                        raise SystemExit(1)

        print("no mixed stack found")
        print(unwinder.get_stats())
    finally:
        # Tear down in the same order regardless of how the try body ended.
        if conn is not None:
            conn.close()
        server.close()
        proc.kill()
        proc.wait(timeout=10.0)

On the unfixed build, this reproduced for me with this output:

mixed stack found at sample 5904
['b_leaf', 'a_parent', '<module>']
{'total_samples': 5904, 'frame_cache_hits': 5887, 'frame_cache_misses': 13, 'frame_cache_partial_hits': 3, 'frames_read_from_cache': 17666, 'frames_read_from_memory': 5942, 'memory_reads': 11889, 'memory_bytes_read': 5722200, 'code_object_cache_hits': 5947, 'code_object_cache_misses': 7, 'stale_cache_invalidations': 0, 'batched_read_attempts': 5903, 'batched_read_successes': 5903, 'batched_read_misses': 0, 'batched_read_segments_requested': 17709, 'batched_read_segments_completed': 17709, 'frame_cache_hit_rate': 99.7797729967813, 'code_object_cache_hit_rate': 99.88243197850184, 'batched_read_success_rate': 100.0, 'batched_read_segment_completion_rate': 100.0}

Linked PRs

Metadata

Metadata

Assignees

No one assigned

    Labels

    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions