-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
302 additions
and
140 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import axios from "axios"; | ||
import { Service } from "kiss-framework"; | ||
import { base64 } from "multiformats/bases/base64"; | ||
|
||
/**
 * Record describing a file within a named collection.
 *
 * NOTE(review): this type is not referenced anywhere in the visible part of
 * this file; the field meanings below are inferred from their names only and
 * should be confirmed against the producer of this data.
 */
interface CollectionData {
    /** Presumably the name of the collection the file belongs to. */
    collection_name: string;
    /** Presumably the name of the file. */
    filename: string;
    /** Presumably whether the file's type was recognised — TODO confirm. */
    known_type: boolean;
    /** Boolean status flag; exact meaning is not visible here — TODO confirm. */
    status: boolean;
}
|
||
/**
 * Thin proxy around an Ollama HTTP endpoint that asks the `extract-emails`
 * model to process text taken from a PDF.
 */
@Service
export class OllamaProxyService {

    /**
     * Sends the given text as the prompt of a non-streaming generate request
     * and returns the body of the HTTP response.
     *
     * NOTE(review): the value returned is the raw `response.data` payload, not
     * a bare list of addresses. The controller that calls this method reads
     * the result's `response` property, so the declared `Promise<string[]>`
     * does not describe the runtime shape. The signature is left unchanged so
     * existing callers keep compiling — confirm the payload shape against the
     * Ollama API before tightening it.
     *
     * @param pdfText Plain text extracted from the PDF, used verbatim as the prompt.
     * @returns The response payload produced by the endpoint.
     * @throws Whatever `axios.post` rejects with (network failure, non-2xx status).
     */
    async extractEmailsFromPdf(pdfText: string): Promise<string[]> {
        // Make a POST request to the Ollama API
        const response = await axios.post('http://192.168.178.208:11434/api/generate', {
            model: 'extract-emails',
            prompt: pdfText,
            // Ask for one complete JSON body rather than a stream of partial chunks.
            stream: false,
        });
        console.debug(`response.data:`, response.data);
        return response.data;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import { json } from '@helia/json'; | ||
import { Controller, Get, assert } from "kiss-framework"; | ||
import { OllamaProxyService } from "../services/ollama.service.js"; | ||
import { HeliaService } from '../services/helia.service.js'; | ||
import { base64 } from 'multiformats/bases/base64'; | ||
import { CID } from 'multiformats'; | ||
import { dagCbor } from '@helia/dag-cbor' | ||
import { unixfs } from '@helia/unixfs'; | ||
import pdfParse from 'pdf-parse-fork'; | ||
|
||
/**
 * Cheap syntactic check for whether a value looks like an IPFS CID.
 *
 * Two textual forms are recognised:
 *  - a string starting with the multibase prefix `b` followed by one or more
 *    alphanumeric characters (the usual CIDv1 rendering);
 *  - a 46-character CIDv0 string: the literal prefix `Qm` followed by 44
 *    alphanumeric characters.
 *
 * The whole string must match; surrounding or trailing characters are
 * rejected. This is only a shape check and does not prove the value decodes
 * to a valid CID.
 *
 * @param link Either the CID string itself, or an object carrying the CID
 *             string under its own `/` key.
 * @returns `true` when the string (or the object's `/` value) has one of the
 *          shapes above, `false` otherwise — including for `null`, objects
 *          without a string `/` property, and any other value.
 */
export function isCid(link: string | { [key: string]: string }): boolean {
    // The alternation is wrapped in a group so that ^ and $ apply to BOTH
    // branches, and "Qm" is matched as a literal two-character prefix.
    const cidPattern = /^(?:b[a-zA-Z0-9]+|Qm[a-zA-Z0-9]{44})$/;

    if (typeof link === 'string') {
        return cidPattern.test(link);
    }

    if (typeof link === 'object' && link !== null) {
        // Go through Object.prototype so objects that shadow or lack
        // hasOwnProperty (e.g. Object.create(null)) are handled safely.
        if (!Object.prototype.hasOwnProperty.call(link, '/')) {
            return false;
        }
        const target: unknown = link['/'];
        return typeof target === 'string' && cidPattern.test(target);
    }

    return false;
}
|
||
/**
 * Controller exposing a `pdf` GET route that loads a PDF by CID through
 * Helia's UnixFS API, extracts its text, and forwards that text to the
 * Ollama proxy service.
 */
@Controller
export class extractEmailsFromPdfController {

    private ollama = new OllamaProxyService()

    private helia = new HeliaService()

    /**
     * Reads the PDF identified by `pdf`, parses it to text, and returns the
     * `response` field of what the Ollama proxy service produces for that text.
     *
     * @param pdf String form of the CID of the PDF to read.
     * @returns The `response` property of the service result.
     *          NOTE(review): the service is declared to return `string[]`, yet
     *          this method indexes it with `'response'`; confirm the runtime
     *          shape before relying on the declared element type.
     * @throws When `pdf` fails the `isCid` assertion (presumably `assert`
     *         throws — verify against kiss-framework), when `CID.parse`
     *         rejects it, or when fetching, parsing, or the downstream request fails.
     */
    @Get('pdf')
    async extractEmails(pdf: string): Promise<string[]> {
        console.debug(`extractEmails. pdf:`, pdf);
        assert(isCid(pdf), 'pdf is not a valid cid');
        console.debug(`pdf is a valid cid`);
        const heliaNode = this.helia.heliaNode
        const fs = unixfs(heliaNode)
        console.debug(`extractEmails. fs working`);
        const pdfCid = CID.parse(pdf)
        console.debug(`pdfCid:`, pdfCid);

        // Collect the chunks and join them once at the end. Re-building the
        // accumulated array on every chunk copies all previous bytes again,
        // which is quadratic in the file size.
        const chunks: Uint8Array[] = [];
        for await (const chunk of fs.cat(pdfCid)) {
            // Copy each chunk so the stored bytes do not alias the iterator's buffer.
            chunks.push(new Uint8Array(chunk));
        }
        const pdfBuffer = Buffer.concat(chunks);
        console.debug(`pdfData length:`, pdfBuffer.length);

        const data = await pdfParse(pdfBuffer);
        const textContent = data.text.trim();
        console.debug(`pdfInfo:`, textContent);
        const emails = await this.ollama.extractEmailsFromPdf(textContent);
        // The service result is read through its `response` property.
        return emails['response'];
    }

}
Oops, something went wrong.