Skip to content

Commit 43ccc5b

Browse files
committed
Catch exceptions from pdfminer and malformed PDFs
... thanks to OSS-Fuzz and @ennamarie19 Cf.: google/oss-fuzz#12949
1 parent a77808a commit 43ccc5b

22 files changed

+85
-6
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
570570
- [@wodny](https://github.com/wodny)
571571
- [Michal Stolarczyk](https://github.com/stolarczyk)
572572
- [Brandon Roberts](https://github.com/brandonrobertz)
573+
- [@ennamarie19](https://github.com/ennamarie19)
573574

574575
## Contributing
575576

pdfplumber/display.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from . import utils
1010
from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq
1111
from .table import T_table_settings, Table, TableFinder, TableSettings
12+
from .utils.exceptions import MalformedPDFException
1213

1314
if TYPE_CHECKING: # pragma: nocover
1415
import pandas as pd
@@ -52,7 +53,11 @@ def get_page_image(
5253
stream.seek(0)
5354
src = stream
5455

55-
pdfium_doc = pypdfium2.PdfDocument(src, password=password)
56+
try:
57+
pdfium_doc = pypdfium2.PdfDocument(src, password=password)
58+
except pypdfium2._helpers.misc.PdfiumError as e:
59+
raise MalformedPDFException(e)
60+
5661
pdfium_page = pdfium_doc.get_page(page_ix)
5762

5863
img: PIL.Image.Image = pdfium_page.render(

pdfplumber/page.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import numbers
12
import re
23
from functools import lru_cache
34
from typing import (
@@ -35,6 +36,7 @@
3536
from .structure import PDFStructTree, StructTreeMissing
3637
from .table import T_table_settings, Table, TableFinder, TableSettings
3738
from .utils import decode_text, resolve_all, resolve_and_decode
39+
from .utils.exceptions import MalformedPDFException, PdfminerException
3840
from .utils.text import TextMap
3941

4042
lt_pat = re.compile(r"^LT")
@@ -184,6 +186,10 @@ def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
184186
# conventionally specified by their lower-left and upper-right
185187
# corners, it is acceptable to specify any two diagonally opposite
186188
# corners."
189+
if not all(isinstance(x, numbers.Number) for x in box_raw):
190+
raise MalformedPDFException(
191+
f"Bounding box contains non-number coordinate(s): {box_raw}"
192+
)
187193
x0, x1 = sorted((box_raw[0], box_raw[2]))
188194
y0, y1 = sorted((box_raw[1], box_raw[3]))
189195
if rotation in [90, 270]:
@@ -276,7 +282,10 @@ def layout(self) -> LTPage:
276282
laparams=self.pdf.laparams,
277283
)
278284
interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
279-
interpreter.process_page(self.page_obj)
285+
try:
286+
interpreter.process_page(self.page_obj)
287+
except Exception as e:
288+
raise PdfminerException(e)
280289
self._layout: LTPage = device.get_result()
281290
return self._layout
282291

@@ -339,7 +348,10 @@ def parse(annot: T_obj) -> T_obj:
339348
parsed["data"] = annot
340349
return parsed
341350

342-
raw = resolve_all(self.page_obj.annots) or []
351+
try:
352+
raw = resolve_all(self.page_obj.annots) or []
353+
except RecursionError:
354+
raise MalformedPDFException("Annotations are infinitely recursive.")
343355
parsed = list(map(parse, raw))
344356
if isinstance(self, CroppedPage):
345357
return self._crop_fn(parsed)

pdfplumber/pdf.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pathlib
44
from io import BufferedReader, BytesIO
55
from types import TracebackType
6-
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
6+
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Type, Union
77

88
from pdfminer.layout import LAParams
99
from pdfminer.pdfdocument import PDFDocument
@@ -18,6 +18,7 @@
1818
from .repair import T_repair_setting, _repair
1919
from .structure import PDFStructTree, StructTreeMissing
2020
from .utils import resolve_and_decode
21+
from .utils.exceptions import PdfminerException
2122

2223
logger = logging.getLogger(__name__)
2324

@@ -46,7 +47,10 @@ def __init__(
4647
self.unicode_norm = unicode_norm
4748
self.raise_unicode_errors = raise_unicode_errors
4849

49-
self.doc = PDFDocument(PDFParser(stream), password=password or "")
50+
try:
51+
self.doc = PDFDocument(PDFParser(stream), password=password or "")
52+
except Exception as e:
53+
raise PdfminerException(e)
5054
self.rsrcmgr = PDFResourceManager()
5155
self.metadata = {}
5256

@@ -146,7 +150,18 @@ def pages(self) -> List[Page]:
146150
doctop: T_num = 0
147151
pp = self.pages_to_parse
148152
self._pages: List[Page] = []
149-
for i, page in enumerate(PDFPage.create_pages(self.doc)):
153+
154+
def iter_pages() -> Generator[PDFPage, None, None]:
155+
gen = PDFPage.create_pages(self.doc)
156+
while True:
157+
try:
158+
yield next(gen)
159+
except StopIteration:
160+
break
161+
except Exception as e:
162+
raise PdfminerException(e)
163+
164+
for i, page in enumerate(iter_pages()):
150165
page_number = i + 1
151166
if pp is not None and page_number not in pp:
152167
continue

pdfplumber/utils/exceptions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
class MalformedPDFException(Exception):
2+
pass
3+
4+
5+
class PdfminerException(Exception):
6+
pass
131 KB
Binary file not shown.
123 KB
Binary file not shown.
1.54 KB
Binary file not shown.
137 KB
Binary file not shown.
4.64 KB
Binary file not shown.

0 commit comments

Comments
 (0)