Some presentations do not open at all
Failure to open some Presentations, not all
from pptx import Presentation
prs = Presentation("C:\\Users\\ISAT\\Desktop\\Teste PPT\\43733.pptx")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\api.py", line 28, in Presentation
presentation_part = Package.open(pptx).main_document_part
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\package.py", line 73, in open
return cls(pkg_file)._load()
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\package.py", line 157, in _load
pkg_xml_rels, parts = _PackageLoader.load(self._pkg_file, self)
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\package.py", line 186, in load
return cls(pkg_file, package)._load()
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\package.py", line 190, in _load
parts, xml_rels = self._parts, self._xml_rels
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\util.py", line 215, in __get__
value = self._fget(obj)
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\package.py", line 219, in _parts
content_types = self._content_types
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\util.py", line 215, in __get__
value = self._fget(obj)
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\package.py", line 203, in _content_types
return _ContentTypeMap.from_xml(self._package_reader[CONTENT_TYPES_URI])
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\serialized.py", line 35, in __getitem__
return self._blob_reader[pack_uri]
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\serialized.py", line 176, in __getitem__
if pack_uri not in self._blobs:
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\util.py", line 215, in __get__
value = self._fget(obj)
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\serialized.py", line 184, in _blobs
return {PackURI("/%s" % name): z.read(name) for name in z.namelist()}
File "C:\Users\ISAT\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pptx\opc\serialized.py", line 184, in <dictcomp>
return {PackURI("/%s" % name): z.read(name) for name in z.namelist()}
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\zipfile.py", line 1473, in read
return fp.read()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\zipfile.py", line 910, in read
buf += self._read1(self.MAX_N)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\zipfile.py", line 1014, in _read1
self._update_crc(data)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\zipfile.py", line 942, in _update_crc
raise BadZipFile("Bad CRC-32 for file %r" % self.name)
zipfile.BadZipFile: Bad CRC-32 for file 'ppt/media/media13.m4a'
The path string is not the problem; I have already tested that. This only occurs with some .pptx files, and I can't figure out why.
For some reason, it appears to happen only when there is an .m4a audio file embedded in the presentation.
Upload an example pptx....
Hi, is there any solution for this? It's happening to me too. When there is any .mov file inserted in the presentation, it returns an error:
BadZipFile: Bad CRC-32 for file 'ppt/media/media2.mov'
I'm literally just trying to open the presentation with the constructor:
Presentation(path)
And it returns:
---------------------------------------------------------------------------
BadZipFile Traceback (most recent call last)
Input In [65], in <cell line: 1>()
----> 1 Presentation(path)
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\api.py:28, in Presentation(pptx)
25 if pptx is None:
26 pptx = _default_pptx_path()
---> 28 presentation_part = Package.open(pptx).main_document_part
30 if not _is_pptx_package(presentation_part):
31 tmpl = "file '%s' is not a PowerPoint file, content type is '%s'"
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\package.py:73, in OpcPackage.open(cls, pkg_file)
70 @classmethod
71 def open(cls, pkg_file):
72 """Return an |OpcPackage| instance loaded with the contents of `pkg_file`."""
---> 73 return cls(pkg_file)._load()
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\package.py:157, in OpcPackage._load(self)
155 def _load(self):
156 """Return the package after loading all parts and relationships."""
--> 157 pkg_xml_rels, parts = _PackageLoader.load(self._pkg_file, self)
158 self._rels.load_from_xml(PACKAGE_URI, pkg_xml_rels, parts)
159 return self
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\package.py:186, in _PackageLoader.load(cls, pkg_file, package)
174 @classmethod
175 def load(cls, pkg_file, package):
176 """Return (pkg_xml_rels, parts) pair resulting from loading `pkg_file`.
177
178 The returned `parts` value is a {partname: part} mapping with each part in the
(...)
184 object) to load those relationships into its |_Relationships| object.
185 """
--> 186 return cls(pkg_file, package)._load()
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\package.py:190, in _PackageLoader._load(self)
188 def _load(self):
189 """Return (pkg_xml_rels, parts) pair resulting from loading pkg_file."""
--> 190 parts, xml_rels = self._parts, self._xml_rels
192 for partname, part in parts.items():
193 part.load_rels_from_xml(xml_rels[partname], parts)
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\util.py:215, in lazyproperty.__get__(self, obj, type)
210 value = obj.__dict__.get(self.__name__)
211 if value is None:
212 # ---on first access, __dict__ item will absent. Evaluate fget()
213 # ---and store that value in the (otherwise unused) host-object
214 # ---__dict__ value of same name ('fget' nominally)
--> 215 value = self._fget(obj)
216 obj.__dict__[self.__name__] = value
217 return value
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\package.py:219, in _PackageLoader._parts(self)
210 @lazyproperty
211 def _parts(self):
212 """dict {partname: Part} populated with parts loading from package.
213
214 Among other duties, this collection is passed to each relationships collection
(...)
217 loaded.
218 """
--> 219 content_types = self._content_types
220 package = self._package
221 package_reader = self._package_reader
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\util.py:215, in lazyproperty.__get__(self, obj, type)
210 value = obj.__dict__.get(self.__name__)
211 if value is None:
212 # ---on first access, __dict__ item will absent. Evaluate fget()
213 # ---and store that value in the (otherwise unused) host-object
214 # ---__dict__ value of same name ('fget' nominally)
--> 215 value = self._fget(obj)
216 obj.__dict__[self.__name__] = value
217 return value
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\package.py:203, in _PackageLoader._content_types(self)
197 @lazyproperty
198 def _content_types(self):
199 """|_ContentTypeMap| object providing content-types for items of this package.
200
201 Provides a content-type (MIME-type) for any given partname.
202 """
--> 203 return _ContentTypeMap.from_xml(self._package_reader[CONTENT_TYPES_URI])
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\serialized.py:35, in PackageReader.__getitem__(self, pack_uri)
33 def __getitem__(self, pack_uri):
34 """Return bytes for part corresponding to `pack_uri`."""
---> 35 return self._blob_reader[pack_uri]
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\serialized.py:176, in _ZipPkgReader.__getitem__(self, pack_uri)
171 def __getitem__(self, pack_uri):
172 """Return bytes for part corresponding to `pack_uri`.
173
174 Raises |KeyError| if no matching member is present in zip archive.
175 """
--> 176 if pack_uri not in self._blobs:
177 raise KeyError("no member '%s' in package" % pack_uri)
178 return self._blobs[pack_uri]
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\util.py:215, in lazyproperty.__get__(self, obj, type)
210 value = obj.__dict__.get(self.__name__)
211 if value is None:
212 # ---on first access, __dict__ item will absent. Evaluate fget()
213 # ---and store that value in the (otherwise unused) host-object
214 # ---__dict__ value of same name ('fget' nominally)
--> 215 value = self._fget(obj)
216 obj.__dict__[self.__name__] = value
217 return value
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\serialized.py:184, in _ZipPkgReader._blobs(self)
182 """dict mapping partname to package part binaries."""
183 with zipfile.ZipFile(self._pkg_file, "r") as z:
--> 184 return {PackURI("/%s" % name): z.read(name) for name in z.namelist()}
File ~\Anaconda3\envs\base_\lib\site-packages\pptx\opc\serialized.py:184, in <dictcomp>(.0)
182 """dict mapping partname to package part binaries."""
183 with zipfile.ZipFile(self._pkg_file, "r") as z:
--> 184 return {PackURI("/%s" % name): z.read(name) for name in z.namelist()}
File ~\Anaconda3\envs\base_\lib\zipfile.py:1474, in ZipFile.read(self, name, pwd)
1472 """Return file bytes for name."""
1473 with self.open(name, "r", pwd) as fp:
-> 1474 return fp.read()
File ~\Anaconda3\envs\base_\lib\zipfile.py:911, in ZipExtFile.read(self, n)
909 self._offset = 0
910 while not self._eof:
--> 911 buf += self._read1(self.MAX_N)
912 return buf
914 end = n + self._offset
File ~\Anaconda3\envs\base_\lib\zipfile.py:1015, in ZipExtFile._read1(self, n)
1013 if self._left <= 0:
1014 self._eof = True
-> 1015 self._update_crc(data)
1016 return data
File ~\Anaconda3\envs\base_\lib\zipfile.py:943, in ZipExtFile._update_crc(self, newdata)
941 # Check the CRC if we're at the end of the file
942 if self._eof and self._running_crc != self._expected_crc:
--> 943 raise BadZipFile("Bad CRC-32 for file %r" % self.name)
BadZipFile: Bad CRC-32 for file 'ppt/media/media2.mov'
I found no solution using this library, but since all I was trying to do was get the number of slides in a presentation, I could do it using the win32com library, roughly like this:
import win32com.client
#prs = Presentation(path_to_pptx.replace("'",""))
#slides_max = len(list(prs.slides))
#console.log(slides_max)
Application = win32com.client.Dispatch("PowerPoint.Application")
Presentation = Application.Presentations.Open(path_to_pptx.replace("'",""))
slides_max = len(Presentation.Slides)
Presentation.Close()
The commented-out part is what you'd do with the pptx library; the uncommented part uses the win32com library.
Maybe it can help, but the win32com library is not user-friendly and its documentation is outdated by now.
Upload an example pptx....
Sorry, I wish I could, but I don't have the authority to do so... Hopefully someone can reproduce it with an empty presentation and a media file, since this seems to happen with .mov files as well.
I have similar issue. I'm using python 3.10.15 + python-pptx 1.0.2 on linux/amd64. Here's the backtrace:
Traceback (most recent call last):
File "/ragflow/rag/svr/task_executor.py", line 448, in handle_task
do_handle_task(task)
File "/ragflow/rag/svr/task_executor.py", line 387, in do_handle_task
cks = build(r)
File "/ragflow/rag/svr/task_executor.py", line 199, in build
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
File "/ragflow/rag/app/presentation.py", line 106, in chunk
for pn, (txt, img) in enumerate(ppt_parser(
File "/ragflow/rag/app/presentation.py", line 28, in __call__
txts = super().__call__(fnm, from_page, to_page)
File "/ragflow/deepdoc/parser/ppt_parser.py", line 44, in __call__
fnm, str) else Presentation(
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/api.py", line 31, in Presentation
presentation_part = Package.open(pptx).main_document_part
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/package.py", line 82, in open
return cls(pkg_file)._load()
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/package.py", line 160, in _load
pkg_xml_rels, parts = _PackageLoader.load(self._pkg_file, cast("Package", self))
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/package.py", line 190, in load
return cls(pkg_file, package)._load()
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/package.py", line 194, in _load
parts, xml_rels = self._parts, self._xml_rels
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/util.py", line 191, in __get__
value = self._fget(obj)
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/package.py", line 222, in _parts
content_types = self._content_types
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/util.py", line 191, in __get__
value = self._fget(obj)
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/package.py", line 207, in _content_types
return _ContentTypeMap.from_xml(self._package_reader[CONTENT_TYPES_URI])
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/serialized.py", line 38, in __getitem__
return self._blob_reader[pack_uri]
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/serialized.py", line 187, in __getitem__
if pack_uri not in self._blobs:
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/util.py", line 191, in __get__
value = self._fget(obj)
File "/ragflow/.venv/lib/python3.10/site-packages/pptx/opc/serialized.py", line 194, in _blobs
with zipfile.ZipFile(self._pkg_file, "r") as z:
File "/usr/lib/python3.10/zipfile.py", line 1272, in __init__
self._RealGetContents()
File "/usr/lib/python3.10/zipfile.py", line 1339, in _RealGetContents
raise BadZipFile("File is not a zip file")
zipfile.BadZipFile: File is not a zip file
Here's the sample ppt file: test.ppt.zip
Are you saying you're trying this with a .ppt file? python-pptx only works with .pptx and .pptm files.
@MartinPacker What Python package works with .ppt files?
None that I'm aware of. Perhaps load and save to .pptx in PowerPoint first.
State of the art (such as it is) for .ppt files on Python is to convert to .pptx using LibreOffice. There is an soffice command that installs with LibreOffice which allows converting PPT -> PPTX from the command line. This can be incorporated into Python code if you like. Here's one example elsewhere in open-source code: https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/common/common.py#L250
Note the soffice process is single-threaded so you'll get silent failures if you're running in a multi-threaded environment and end up calling soffice a second time while it's already running.
Easy to do from the command-line if you don't need a server-side solution.
LibreOffice is quite a big install unfortunately, but can be handy to have around locally if you're working on a Mac or Linux.