@@ -67,11 +67,66 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[
6767 logger .warning ("Failed to save image block on page %d" , page_num )
6868 continue
6969
70- rel_path = f"images/{ doc_name } /{ filename } "
70+ rel_path = f"sources/ images/{ doc_name } /{ filename } "
7171 page_images .setdefault (page_num , []).append (rel_path )
7272 return page_images
7373
7474
def convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict]:
    """Convert a PDF to per-page dicts with text content and images.

    Each dict has ``{"page": int, "content": str, "images": [{"path": str}]}``.
    Images are saved to *images_dir* and referenced with wiki-root-relative
    paths of the form ``sources/images/{doc_name}/{filename}``; the same path
    is used for the inline markdown link in ``content`` and for the entry in
    ``images``.

    Args:
        pdf_path: Location of the PDF to read.
        doc_name: Document name used as the sub-directory component of the
            image reference paths.
        images_dir: Directory the extracted PNG files are written to. It is
            created (including parents) if it does not exist.

    Returns:
        One dict per page, in document order, with 1-based ``page`` numbers.

    Image blocks smaller than ``_MIN_IMAGE_DIM`` in either dimension, or
    without image bytes, are skipped. An image that cannot be decoded or
    written is logged as a warning and skipped; it never aborts the
    conversion.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    pages: list[dict] = []
    # Numbered across the whole document (not per page) so filenames stay unique.
    img_counter = 0

    with pymupdf.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc, start=1):
            parts: list[str] = []
            page_images: list[dict] = []

            for block in page.get_text("dict")["blocks"]:
                block_type = block["type"]

                if block_type == 0:  # text block
                    # One output line per PDF line; spans are concatenated as-is.
                    parts.append(
                        "\n".join(
                            "".join(span["text"] for span in line["spans"])
                            for line in block["lines"]
                        )
                    )

                elif block_type == 1:  # image block
                    width = block.get("width", 0)
                    height = block.get("height", 0)
                    if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
                        continue
                    image_bytes = block.get("image")
                    if not image_bytes:
                        continue
                    try:
                        pix = pymupdf.Pixmap(image_bytes)
                        # PNG supports only gray/RGB (+ alpha). Subtract the
                        # alpha channel so CMYK *without* alpha (n == 4) is
                        # converted too, not just CMYK with alpha (n == 5).
                        if pix.n - pix.alpha > 3:
                            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                        img_counter += 1
                        filename = f"p{page_num}_img{img_counter}.png"
                        (images_dir / filename).write_bytes(pix.tobytes("png"))
                        pix = None  # drop the pixmap promptly to release its buffer
                        img_path = f"sources/images/{doc_name}/{filename}"
                        # Link the inline image to the file that was just saved.
                        parts.append(f"\n![image]({img_path})\n")
                        page_images.append({"path": img_path})
                    except Exception:
                        # Best effort: one bad image must not fail the whole
                        # document, but keep the traceback for diagnosis.
                        logger.warning(
                            "Failed to save image block on page %d",
                            page_num,
                            exc_info=True,
                        )

            pages.append({
                "page": page_num,
                "content": "\n".join(parts),
                "images": page_images,
            })
    return pages
128+
129+
75130def convert_pdf_with_images (pdf_path : Path , doc_name : str , images_dir : Path ) -> str :
76131 """Convert a PDF to markdown with inline images using pymupdf dict-mode.
77132
@@ -115,7 +170,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) ->
115170 filename = f"p{ page_num } _img{ img_counter } .png"
116171 (images_dir / filename ).write_bytes (pix .tobytes ("png" ))
117172 pix = None
118- parts .append (f"\n \n " )
173+ parts .append (f"\n \n " )
119174 except Exception :
120175 logger .warning ("Failed to save image block on page %d" , page_num )
121176 return "\n " .join (parts )
@@ -126,7 +181,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
126181
127182 For each ```` match:
128183 - Decode base64 bytes → save to ``images_dir/img_NNN.ext``
129- - Replace the link with ````
184+ - Replace the link with ````
130185 - On decode failure: log a warning and leave the original text unchanged.
131186 """
132187 counter = 0
@@ -150,7 +205,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
150205 images_dir .mkdir (parents = True , exist_ok = True )
151206 dest .write_bytes (image_bytes )
152207
153- new_ref = f""
208+ new_ref = f""
154209 result = result .replace (match .group (0 ), new_ref , 1 )
155210
156211 return result
@@ -164,7 +219,7 @@ def copy_relative_images(
164219 For each ```` match (skipping http/https and data URIs):
165220 - Resolve path relative to ``source_dir``
166221 - Copy to ``images_dir/{filename}``
167- - Replace link with ````
222+ - Replace link with ````
168223 - Missing source file: log a warning and leave the original text unchanged.
169224 """
170225 result = markdown
@@ -186,7 +241,7 @@ def copy_relative_images(
186241 images_dir .mkdir (parents = True , exist_ok = True )
187242 shutil .copy2 (src , dest )
188243
189- new_ref = f""
244+ new_ref = f""
190245 result = result .replace (match .group (0 ), new_ref , 1 )
191246
192247 return result
0 commit comments