diff --git a/python/dolma/taggers/code/code_taggers.py b/python/dolma/taggers/code/code_taggers.py index 31b57087..e0ddbd4e 100644 --- a/python/dolma/taggers/code/code_taggers.py +++ b/python/dolma/taggers/code/code_taggers.py @@ -111,10 +111,8 @@ def _extract_copyright_spans(self, text: str) -> List[Span]: for line in lines: if line.startswith("//") or line.startswith("#") or line.startswith("--") or not line: skip = skip + 1 - if not line: - end += 1 - else: - end += len(line) + # +1 accounts for the "\n" separator that str.split dropped + end += len(line) + 1 else: break