From 96f8b0770adfc1c5320b5cded220fbc0586d442c Mon Sep 17 00:00:00 2001
From: Chessing234
Date: Fri, 17 Apr 2026 16:41:46 +0530
Subject: [PATCH] Fix ft_dataset write_results dropping every other message and opening in binary mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`write_results` reads a message off the queue, checks it against the
exit sentinel, and is supposed to write it out:

    while True:
        msg = q.get()
        if msg == _WRITER_EXIT_MSG:
            break
        if not flag.is_set():
            o.write(q.get())   # <-- second q.get(): discards msg, pulls next
            o.write("\n")
            written += 1

Two bugs in five lines:

1. `o.write(q.get())` calls `q.get()` a second time, so the `msg`
   already retrieved at the top of the loop is never written — it was
   only used for the exit check. Half of the producer's messages are
   dropped, and the write itself can even pull and write the
   `_WRITER_EXIT_MSG` sentinel into the output file.

2. The output file is opened with mode `"wb"` (binary), but the
   producer puts `str` values on the queue and line 112 writes the
   literal `"\n"`. Either call raises
   `TypeError: a bytes-like object is required, not 'str'` the moment
   anything is produced.

Write `msg` instead of re-polling the queue, and open the path in text
mode (`"w"`) to match the `str` payloads produced by `process_file`.
--- python/dolma/core/ft_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dolma/core/ft_dataset.py b/python/dolma/core/ft_dataset.py index 40a63d7e..8ecbdf49 100644 --- a/python/dolma/core/ft_dataset.py +++ b/python/dolma/core/ft_dataset.py @@ -100,7 +100,7 @@ def process_file(config: Config, q: "Queue[str]", flag: Event, label: str, fn): def write_results(config: Config, q: "Queue[str]", flag: Event): written = 0 - with smart_open.open(config.out_path, "wb") as o: + with smart_open.open(config.out_path, "w") as o: while True: msg = q.get() @@ -108,7 +108,7 @@ def write_results(config: Config, q: "Queue[str]", flag: Event): break if not flag.is_set(): - o.write(q.get()) + o.write(msg) o.write("\n") written += 1