Skip to content

Commit e15ed98

Browse files
committed
Fix bug w/ use_text_flow=True extractions (#1279)
... related to flows where text bounces between lines. h/t @samuelbradshaw
1 parent f2ad942 commit e15ed98

File tree

3 files changed

+14
-1
lines changed

3 files changed

+14
-1
lines changed

pdfplumber/utils/text.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -587,7 +587,7 @@ def char_begins_new_word(
587 587   (cx < ax)
588 588   or (cx > bx + x)
589 589   # Interline test
590     - or (cy > ay + y)
    590 + or abs(cy - ay) > y
591 591   )
592 592
593 593   def iter_chars_to_words(

tests/pdfs/issue-1279-example.pdf

36.3 KB
Binary file not shown.

tests/test_utils.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -336,6 +336,19 @@ def test_text_flow_overlapping(self):
336 336   re.search("2015 RICE PAYMENT 26406576 0 1207631 Cr", not_using_flow) is None
337 337   )
338 338
    339 + def test_text_flow_words_mixed_lines(self):
    340 + path = os.path.join(HERE, "pdfs/issue-1279-example.pdf")
    341 +
    342 + with pdfplumber.open(path) as pdf:
    343 + p0 = pdf.pages[0]
    344 + words = p0.extract_words(use_text_flow=True)
    345 +
    346 + texts = set(w["text"] for w in words)
    347 +
    348 + assert "claim" in texts
    349 + assert "lence" in texts
    350 + assert "claimlence" not in texts
    351 +
339 352   def test_extract_text(self):
340 353   text = self.pdf.pages[0].extract_text()
341 354   goal_lines = [

0 commit comments

Comments (0)