The PDF
or Portable Document Format
provides an easy way to transfer text documents
, To make PDF
navigation easy, it is good practice to add a Content Catalog
or Table of Contents
of the PDF
in the initial pages. The Spring AI ETL PDF Paragraph Document Reader
uses the information provided in the PDF catalog
to split the long PDF
into small chunks
of text paragraphs
and output them as a single Document
per paragraph.
Not all PDF documents provide the PDF catalog.
package com.example.springai.controller;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.ExtractedTextFormatter;
import org.springframework.ai.reader.pdf.ParagraphPdfDocumentReader;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import java.util.List;
@RestController
public class SpringAiController {
private final Resource resource;
public SpringAiController(@Value("classpath:spring-boot-reference.pdf") Resource resource) {
this.resource = resource;
}
@GetMapping("/pdfDocument")
List<Document> pdfDocument() {
ParagraphPdfDocumentReader pdfReader = new ParagraphPdfDocumentReader(resource, PdfDocumentReaderConfig.builder()
.withPageTopMargin(0)
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder()
.withNumberOfTopTextLinesToDelete(0)
.build())
.withPagesPerDocument(1)
.build());
return pdfReader.read();
}
}
package com.example.springai;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
@SpringBootApplication
public class SpringAiApplication {
public static void main(String[] args) {
SpringApplication.run(SpringAiApplication.class, args);
}
}
spring.application.name=SpringAi
spring.docker.compose.lifecycle-management=start-only
spring.threads.virtual.enabled=true
# The default Ollama Model in Spring Ai is mistral, but it can be changed by setting the below property. make sure to download the same model in entrypoint.sh file
#spring.ai.ollama.chat.options.model=llama3.1
# If running the Ollama Docker Instance separately, then set this property
spring.docker.compose.enabled=false
services:
ollama-model:
image: ollama/ollama:latest
container_name: ollama_container
ports:
- 11434:11434/tcp
healthcheck:
test: ollama --version || exit 1
command: serve
volumes:
- ./ollama/ollama:/root/.ollama
- ./entrypoint.sh:/entrypoint.sh
pull_policy: missing
tty: true
restart: no
entrypoint: [ "/usr/bin/bash", "/entrypoint.sh" ]
open-webui:
image: ghcr.io/open-webui/open-webui:main
container_name: open_webui_container
environment:
WEBUI_AUTH: false
ports:
- "8081:8080"
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- open-webui:/app/backend/data
restart: no
volumes:
open-webui:
#!/bin/bash
# Start Ollama in the background.
/bin/ollama serve &
# Record Process ID.
pid=$!
# Pause for Ollama to start.
sleep 5
# The default Ollama Model in Spring Ai is mistral, but it can be changed in the applications property file. Make sure to download the same Model here
echo "🔴 Retrieve LLAMA3 model..."
ollama pull mistral
echo "🟢 Done!"
# Wait for the Ollama process to finish.
wait $pid
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>3.3.2</version>
<relativePath/>
</parent>
<groupId>com.example.springai</groupId>
<artifactId>etl-pipeline-text-pdf-paragraph</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>ETL Pipeline Text PDF Paragraph</name>
<description>Demo project for Spring Boot</description>
<properties>
<java.version>21</java.version>
<spring-ai.version>1.0.0-SNAPSHOT</spring-ai.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-ollama-spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-docker-compose</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-pdf-document-reader</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-bom</artifactId>
<version>${spring-ai.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<mainClass>com.example.springai.SpringAiApplication</mainClass>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<releases>
<enabled>false</enabled>
</releases>
</repository>
</repositories>
</project>
Run the curl to see the Spring AI ETL PDF Paragraph Document Reader
http://localhost:8080/pdfDocument