The Spring AI ETL Token Text Splitter Transformer is an implementation of TextSplitter that splits large text into small chunks of Documents based on the token count, it uses the CL100K_BASE encoding. The Spring AI ETL Token Text Splitter Transformer Customized constructor allows to customize some parameters based on requirements, such as chunkSize, minChunkSizeChars, minChunkLengthToEmbed, maxNumChunks, etc
package com.example.springai.controller;
import org.springframework.ai.chat.model.ChatModel;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import java.util.List;
@RestController
public class SpringAiController {
private final Resource apples;
@Autowired
private ChatModel chatModel;
public SpringAiController(@Value("classpath:apples.st") Resource apples) {
this.apples = apples;
}
@GetMapping("/textDocument")
List<Document> textDocument() {
var textReader = new TextReader(apples);
var documents = textReader.get();
var splitter = new TokenTextSplitter(true);
return splitCustomized(splitter.apply(documents));
}
public List<Document> splitCustomized(List<Document> documents) {
TokenTextSplitter splitter = new TokenTextSplitter(100, 40, 10, 500, true);
return splitter.apply(documents);
}
}
package com.example.springai;
import org.springframework.ai.chat.model.ChatModel;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
@SpringBootApplication
public class SpringAiApplication {
@Autowired
private ChatModel chatModel;
public static void main(String[] args) {
SpringApplication.run(SpringAiApplication.class, args);
}
}
create a file apples.st in src/main/resources directory
spring.application.name=SpringAi
spring.docker.compose.lifecycle-management=start-only
spring.threads.virtual.enabled=true
# The default Ollama Model in Spring Ai is mistral, but it can be changed by setting the below property. make sure to download the same model in entrypoint.sh file
#spring.ai.ollama.chat.options.model=llama3.1
# If running the Ollama Docker Instance separately, then set this property
spring.docker.compose.enabled=false
services:
ollama-model:
image: ollama/ollama:latest
container_name: ollama_container
ports:
- 11434:11434/tcp
healthcheck:
test: ollama --version || exit 1
command: serve
volumes:
- ./ollama/ollama:/root/.ollama
- ./entrypoint.sh:/entrypoint.sh
pull_policy: missing
tty: true
restart: no
entrypoint: [ "/usr/bin/bash", "/entrypoint.sh" ]
open-webui:
image: ghcr.io/open-webui/open-webui:main
container_name: open_webui_container
environment:
WEBUI_AUTH: false
ports:
- "8081:8080"
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- open-webui:/app/backend/data
restart: no
volumes:
open-webui:
#!/bin/bash
# Start Ollama in the background.
/bin/ollama serve &
# Record Process ID.
pid=$!
# Pause for Ollama to start.
sleep 5
# The default Ollama Model in Spring Ai is mistral, but it can be changed in the applications property file. Make sure to download the same Model here
echo "🔴 Retrieve LLAMA3 model..."
ollama pull mistral
echo "🟢 Done!"
# Wait for the Ollama process to finish.
wait $pid
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>3.3.2</version>
<relativePath/>
</parent>
<groupId>com.example.springai</groupId>
<artifactId>etl_pipeline_token_text_splitter</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>ETL Pipeline Token Text Splitter</name>
<description>Demo project for Spring Boot</description>
<properties>
<java.version>21</java.version>
<spring-ai.version>1.0.0-SNAPSHOT</spring-ai.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-ollama-spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-docker-compose</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-bom</artifactId>
<version>${spring-ai.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<mainClass>com.example.springai.SpringAiApplication</mainClass>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<releases>
<enabled>false</enabled>
</releases>
</repository>
</repositories>
</project>
Run the curl to see the Spring AI ETL Document JSON Pointer Reader
http://localhost:8080/jsonDocument