file_readers.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. """File reader skills for the OpenHands agent.
  2. This module provides various functions to parse and extract content from different file types,
  3. including PDF, DOCX, LaTeX, audio, image, video, and PowerPoint files. It utilizes different
  4. libraries and APIs to process these files and output their content or descriptions.
  5. Functions:
  6. parse_pdf(file_path: str) -> None: Parse and print content of a PDF file.
  7. parse_docx(file_path: str) -> None: Parse and print content of a DOCX file.
  8. parse_latex(file_path: str) -> None: Parse and print content of a LaTeX file.
  9. parse_audio(file_path: str, model: str = 'whisper-1') -> None: Transcribe and print content of an audio file.
  10. parse_image(file_path: str, task: str = 'Describe this image as detail as possible.') -> None: Analyze and print description of an image file.
  11. parse_video(file_path: str, task: str = 'Describe this image as detail as possible.', frame_interval: int = 30) -> None: Analyze and print description of video frames.
  12. parse_pptx(file_path: str) -> None: Parse and print content of a PowerPoint file.
  13. Note:
  14. Some functions (parse_audio, parse_video, parse_image) require OpenAI API credentials
  15. and are only available if the necessary environment variables are set.
  16. """
  17. import base64
  18. import docx
  19. import PyPDF2
  20. from pptx import Presentation
  21. from pylatexenc.latex2text import LatexNodes2Text
  22. from openhands.runtime.plugins.agent_skills.utils.config import (
  23. _get_max_token,
  24. _get_openai_api_key,
  25. _get_openai_base_url,
  26. _get_openai_client,
  27. _get_openai_model,
  28. )
  29. def parse_pdf(file_path: str) -> None:
  30. """Parses the content of a PDF file and prints it.
  31. Args:
  32. file_path: str: The path to the file to open.
  33. """
  34. print(f'[Reading PDF file from {file_path}]')
  35. content = PyPDF2.PdfReader(file_path)
  36. text = ''
  37. for page_idx in range(len(content.pages)):
  38. text += (
  39. f'@@ Page {page_idx + 1} @@\n'
  40. + content.pages[page_idx].extract_text()
  41. + '\n\n'
  42. )
  43. print(text.strip())
  44. def parse_docx(file_path: str) -> None:
  45. """Parses the content of a DOCX file and prints it.
  46. Args:
  47. file_path: str: The path to the file to open.
  48. """
  49. print(f'[Reading DOCX file from {file_path}]')
  50. content = docx.Document(file_path)
  51. text = ''
  52. for i, para in enumerate(content.paragraphs):
  53. text += f'@@ Page {i + 1} @@\n' + para.text + '\n\n'
  54. print(text)
  55. def parse_latex(file_path: str) -> None:
  56. """Parses the content of a LaTex file and prints it.
  57. Args:
  58. file_path: str: The path to the file to open.
  59. """
  60. print(f'[Reading LaTex file from {file_path}]')
  61. with open(file_path) as f:
  62. data = f.read()
  63. text = LatexNodes2Text().latex_to_text(data)
  64. print(text.strip())
  65. def _base64_img(file_path: str) -> str:
  66. with open(file_path, 'rb') as image_file:
  67. encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
  68. return encoded_image
  69. def _base64_video(file_path: str, frame_interval: int = 10) -> list[str]:
  70. import cv2
  71. video = cv2.VideoCapture(file_path)
  72. base64_frames = []
  73. frame_count = 0
  74. while video.isOpened():
  75. success, frame = video.read()
  76. if not success:
  77. break
  78. if frame_count % frame_interval == 0:
  79. _, buffer = cv2.imencode('.jpg', frame)
  80. base64_frames.append(base64.b64encode(buffer).decode('utf-8'))
  81. frame_count += 1
  82. video.release()
  83. return base64_frames
  84. def _prepare_image_messages(task: str, base64_image: str):
  85. return [
  86. {
  87. 'role': 'user',
  88. 'content': [
  89. {'type': 'text', 'text': task},
  90. {
  91. 'type': 'image_url',
  92. 'image_url': {'url': f'data:image/jpeg;base64,{base64_image}'},
  93. },
  94. ],
  95. }
  96. ]
  97. def parse_audio(file_path: str, model: str = 'whisper-1') -> None:
  98. """Parses the content of an audio file and prints it.
  99. Args:
  100. file_path: str: The path to the audio file to transcribe.
  101. model: str: The audio model to use for transcription. Defaults to 'whisper-1'.
  102. """
  103. print(f'[Transcribing audio file from {file_path}]')
  104. try:
  105. # TODO: record the COST of the API call
  106. with open(file_path, 'rb') as audio_file:
  107. transcript = _get_openai_client().audio.translations.create(
  108. model=model, file=audio_file
  109. )
  110. print(transcript.text)
  111. except Exception as e:
  112. print(f'Error transcribing audio file: {e}')
  113. def parse_image(
  114. file_path: str, task: str = 'Describe this image as detail as possible.'
  115. ) -> None:
  116. """Parses the content of an image file and prints the description.
  117. Args:
  118. file_path: str: The path to the file to open.
  119. task: str: The task description for the API call. Defaults to 'Describe this image as detail as possible.'.
  120. """
  121. print(f'[Reading image file from {file_path}]')
  122. # TODO: record the COST of the API call
  123. try:
  124. base64_image = _base64_img(file_path)
  125. response = _get_openai_client().chat.completions.create(
  126. model=_get_openai_model(),
  127. messages=_prepare_image_messages(task, base64_image),
  128. max_tokens=_get_max_token(),
  129. )
  130. content = response.choices[0].message.content
  131. print(content)
  132. except Exception as error:
  133. print(f'Error with the request: {error}')
  134. def parse_video(
  135. file_path: str,
  136. task: str = 'Describe this image as detail as possible.',
  137. frame_interval: int = 30,
  138. ) -> None:
  139. """Parses the content of an image file and prints the description.
  140. Args:
  141. file_path: str: The path to the video file to open.
  142. task: str: The task description for the API call. Defaults to 'Describe this image as detail as possible.'.
  143. frame_interval: int: The interval between frames to analyze. Defaults to 30.
  144. """
  145. print(
  146. f'[Processing video file from {file_path} with frame interval {frame_interval}]'
  147. )
  148. task = task or 'This is one frame from a video, please summarize this frame.'
  149. base64_frames = _base64_video(file_path)
  150. selected_frames = base64_frames[::frame_interval]
  151. if len(selected_frames) > 30:
  152. new_interval = len(base64_frames) // 30
  153. selected_frames = base64_frames[::new_interval]
  154. print(f'Totally {len(selected_frames)} would be analyze...\n')
  155. idx = 0
  156. for base64_frame in selected_frames:
  157. idx += 1
  158. print(f'Process the {file_path}, current No. {idx * frame_interval} frame...')
  159. # TODO: record the COST of the API call
  160. try:
  161. response = _get_openai_client().chat.completions.create(
  162. model=_get_openai_model(),
  163. messages=_prepare_image_messages(task, base64_frame),
  164. max_tokens=_get_max_token(),
  165. )
  166. content = response.choices[0].message.content
  167. current_frame_content = f"Frame {idx}'s content: {content}\n"
  168. print(current_frame_content)
  169. except Exception as error:
  170. print(f'Error with the request: {error}')
  171. def parse_pptx(file_path: str) -> None:
  172. """Parses the content of a pptx file and prints it.
  173. Args:
  174. file_path: str: The path to the file to open.
  175. """
  176. print(f'[Reading PowerPoint file from {file_path}]')
  177. try:
  178. pres = Presentation(str(file_path))
  179. text = []
  180. for slide_idx, slide in enumerate(pres.slides):
  181. text.append(f'@@ Slide {slide_idx + 1} @@')
  182. for shape in slide.shapes:
  183. if hasattr(shape, 'text'):
  184. text.append(shape.text)
  185. print('\n'.join(text))
  186. except Exception as e:
  187. print(f'Error reading PowerPoint file: {e}')
  188. __all__ = [
  189. 'parse_pdf',
  190. 'parse_docx',
  191. 'parse_latex',
  192. 'parse_pptx',
  193. ]
  194. # This is called from OpenHands's side
  195. # If SANDBOX_ENV_OPENAI_API_KEY is set, we will be able to use these tools in the sandbox environment
  196. if _get_openai_api_key() and _get_openai_base_url():
  197. __all__ += ['parse_audio', 'parse_video', 'parse_image']