Fix bug w/ use_text_flow=True extractions (#1279)

jsvine · jsvine · commit e15ed98a2677 · 2025-03-27T22:44:25.000-04:00
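Fixes word splitting with use_text_flow=True when consecutive characters bounce between lines: the interline test now checks the absolute vertical offset (abs(cy - ay) > y), so a jump in either direction starts a new word; previously only cy > ay + y did. h/t @samuelbradshaw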
diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py
@@ -587,7 +587,7 @@ def char_begins_new_word(
             (cx < ax)
             or (cx > bx + x)
             # Interline test
-            or (cy > ay + y)
+            or abs(cy - ay) > y
         )
 
     def iter_chars_to_words(
diff --git a/tests/pdfs/issue-1279-example.pdf b/tests/pdfs/issue-1279-example.pdf
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -336,6 +336,19 @@ def test_text_flow_overlapping(self):
             re.search("2015 RICE PAYMENT 26406576 0 1207631 Cr", not_using_flow) is None
         )
 
+    def test_text_flow_words_mixed_lines(self):
+        path = os.path.join(HERE, "pdfs/issue-1279-example.pdf")
+
+        with pdfplumber.open(path) as pdf:
+            p0 = pdf.pages[0]
+            words = p0.extract_words(use_text_flow=True)
+
+        texts = set(w["text"] for w in words)
+
+        assert "claim" in texts
+        assert "lence" in texts
+        assert "claimlence" not in texts
+
     def test_extract_text(self):
         text = self.pdf.pages[0].extract_text()
         goal_lines = [