-
-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add example for FTS4 of SQLite3
- Loading branch information
Showing
3 changed files
with
227 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
/* | ||
# TL; DR | ||
This example shows how to work with Japanese text data and perform efficient full-text search using Kagome and SQLite3. | ||
# TS; WM | ||
In this example, each line of text is inserted into a row of the SQLite3 database, and then the database is searched for the word "かき". | ||
Note that the string tokenized by Kagome, a.k.a. "Wakati", is recorded in a separate table for FTS (Full-Text-Search) at the same time as the original text. | ||
This allows Unicode text data that is not separated by spaces, such as Japanese, to be searched by FTS. | ||
Aim of this example: | ||
This example can be useful in scenarios where you need to perform full-text searches on Japanese text. It demonstrates how to tokenize Japanese text using Kagome, which is a common requirement when working with text data in the Japanese language. By using SQLite with FTS4, it efficiently manages and searches through a large amount of text data, making it suitable for applications like: | ||
1. **Search Engines:** You can use this code as a basis for building a search engine that indexes and searches Japanese text content. | ||
2. **Document Management Systems:** This code can be integrated into a document management system to enable full-text search capabilities for Japanese documents. | ||
3. **Content Recommendation Systems:** When you have a large collection of Japanese content, you can use this code to implement content recommendation systems based on user queries. | ||
4. **Chatbots and NLP:** If you're building chatbots or natural language processing (NLP) systems for Japanese language, this code can assist in text analysis and search within the chatbot's knowledge base. | ||
Acknowledgements: | ||
This example is based in part on the following book. | ||
- p.372, 9.2 "データーベース登録プログラム", "Go言語プログラミングエッセンス エンジニア選書" | ||
- Written by: Mattn | ||
- Published: 2023/3/9 (技術評論社) | ||
- ISBN: 4297134195 / 978-4297134198 | ||
- ASIN: B0BVZCJQ4F / https://amazon.co.jp/dp/4297134195 | ||
*/ | ||
package main | ||
|
||
import ( | ||
"database/sql" | ||
"errors" | ||
"fmt" | ||
"log" | ||
"strings" | ||
|
||
"github.com/ikawaha/kagome-dict/ipa" | ||
"github.com/ikawaha/kagome/v2/tokenizer" | ||
_ "github.com/mattn/go-sqlite3" | ||
) | ||
|
||
func main() { | ||
// Search word | ||
const searchWord = "かき" | ||
|
||
// Contents to be inserted into the database. Each element represents a line | ||
// of text and will be inserted into a row of the database. | ||
lines := []string{ | ||
"あいうえお", | ||
"かきくけこ", | ||
"さしすせそ", | ||
"たちつてと", | ||
"なにぬねの", | ||
"はひふへほ", | ||
"まみむめも", | ||
"やゆよ", | ||
"らりるれろ", | ||
"わをん", | ||
} | ||
|
||
// Create a database. In-memory database is used for simplicity. | ||
db, err := sql.Open("sqlite3", ":memory:") | ||
PanicOnError(err) | ||
|
||
defer db.Close() | ||
|
||
// Create tables. | ||
// The first table "contents_fts" is for storing the original content, and | ||
// the second table "fts" is for storing the tokenized content. | ||
_, err = db.Exec(` | ||
CREATE TABLE IF NOT EXISTS contents_fts(docid INTEGER PRIMARY KEY AUTOINCREMENT, words TEXT); | ||
CREATE VIRTUAL TABLE IF NOT EXISTS fts USING fts4(content, tokenize=unicode61 "remove_diacritics=2"); | ||
`) | ||
PanicOnError(err) | ||
|
||
// Insert contents | ||
for _, line := range lines { | ||
rowID, err := insertContent(db, line) | ||
PanicOnError(err) | ||
|
||
err = insertSearchToken(db, rowID, line) | ||
PanicOnError(err) | ||
} | ||
|
||
// Search | ||
rowIDsFound, err := searchFTS4(db, searchWord) | ||
PanicOnError(err) | ||
|
||
// Print search results | ||
for _, rowID := range rowIDsFound { | ||
cont, err := retrieveContent(db, rowID) | ||
PanicOnError(err) | ||
|
||
fmt.Printf("Found content: %s at line: %v\n", cont, rowID) | ||
} | ||
// Output: | ||
// Searching for: かき | ||
// Found content: かきくけこ at line: 2 | ||
} | ||
|
||
// insertContent stores content as a new row of the "contents_fts" table and
// returns the ID reported by the driver for the inserted row. If the INSERT
// fails, it returns 0 together with the error.
func insertContent(db *sql.DB, content string) (int64, error) {
	const insertStmt = `INSERT INTO contents_fts(words) VALUES(?)`

	result, execErr := db.Exec(insertStmt, content)
	if execErr == nil {
		return result.LastInsertId()
	}

	return 0, execErr
}
|
||
func insertSearchToken(db *sql.DB, rowID int64, content string) error { | ||
// This example uses the IPA dictionary, but it may be more efficient to use | ||
// the 'Uni' dictionary if memory is available. | ||
tknzr, err := tokenizer.New(ipa.Dict(), tokenizer.OmitBosEos()) | ||
PanicOnError(err) | ||
|
||
seg := tknzr.Wakati(content) | ||
tokenizedContent := strings.Join(seg, " ") | ||
|
||
_, err = db.Exec( | ||
`INSERT INTO fts(docid, content) VALUES(?, ?)`, | ||
rowID, | ||
tokenizedContent, | ||
) | ||
|
||
return err | ||
} | ||
|
||
// PanicOnError does nothing when err is nil. Otherwise it hands err to
// log.Panic, which logs it and then panics.
func PanicOnError(err error) {
	if err == nil {
		return
	}

	log.Panic(err)
}
|
||
// retrieveContent returns the "words" column of the "contents_fts" row whose
// rowid equals rowID.
//
// It returns the error "no content found" when no such row exists. Any other
// failure (query, iteration or scan) is returned wrapped with context rather
// than being reported as a missing row.
func retrieveContent(db *sql.DB, rowID int) (string, error) {
	var words string

	// QueryRow defers every error to Scan, including errors raised while
	// advancing the result set, so none of them can be silently dropped.
	err := db.QueryRow(
		`SELECT words FROM contents_fts WHERE rowid=?`,
		rowID,
	).Scan(&words)

	switch {
	case errors.Is(err, sql.ErrNoRows):
		return "", errors.New("no content found")
	case err != nil:
		return "", fmt.Errorf("retrieving content for row %d: %w", rowID, err)
	}

	return words, nil
}
|
||
// searchFTS4 prints the search word to standard output, runs a MATCH query for
// it against the "content" column of the "fts" table and returns the rowid of
// every matching row, in the order the database yields them.
//
// The result is nil when nothing matches. On any query, scan or iteration
// error it returns a nil slice and the wrapped error, never a partial result.
func searchFTS4(db *sql.DB, searchWord string) ([]int, error) {
	fmt.Println("Searching for:", searchWord)

	// Only the rowid is needed; the tokenized content itself is not used.
	rows, err := db.Query(`SELECT rowid FROM fts WHERE content MATCH ?`, searchWord)
	if err != nil {
		return nil, fmt.Errorf("querying fts for %q: %w", searchWord, err)
	}

	defer rows.Close()

	var lineIDs []int

	for rows.Next() {
		var lineID int

		if err := rows.Scan(&lineID); err != nil {
			return nil, fmt.Errorf("scanning fts row: %w", err)
		}

		lineIDs = append(lineIDs, lineID)
	}

	// rows.Next returns false both at the end of the result set and on
	// failure; rows.Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("iterating fts rows: %w", err)
	}

	return lineIDs, nil
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
module kagome/examples | ||
|
||
go 1.19 | ||
|
||
require ( | ||
github.com/ikawaha/kagome-dict/ipa v1.0.10 | ||
github.com/ikawaha/kagome-dict/uni v1.1.9 | ||
github.com/ikawaha/kagome/v2 v2.9.3 | ||
github.com/mattn/go-sqlite3 v1.14.17 | ||
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 | ||
) | ||
|
||
require ( | ||
github.com/ikawaha/kagome-dict v1.0.9 // indirect | ||
github.com/zenizh/go-capturer v0.0.0-20211219060012-52ea6c8fed04 // indirect | ||
) | ||
|
||
replace github.com/ikawaha/kagome/v2 => ../../ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
github.com/ikawaha/kagome-dict v1.0.9 h1:1Gg735LbBYsdFu13fdTvW6eVt0qIf5+S2qXGJtlG8C0= | ||
github.com/ikawaha/kagome-dict v1.0.9/go.mod h1:mn9itZLkFb6Ixko7q8eZmUabHbg3i9EYewnhOtvd2RM= | ||
github.com/ikawaha/kagome-dict/ipa v1.0.10 h1:wk9I21yg+fKdL6HJB9WgGiyXIiu1VttumJwmIRwn0g8= | ||
github.com/ikawaha/kagome-dict/ipa v1.0.10/go.mod h1:rbaOKrF58zhtpV2+2sVZBj0sUSp9dVKPjr660MehJbs= | ||
github.com/ikawaha/kagome-dict/uni v1.1.9 h1:cyKLswS8DSjUPTwsOjlC4WEqRkndUUVgiJR0lcFqZUk= | ||
github.com/ikawaha/kagome-dict/uni v1.1.9/go.mod h1:xg/2qumqt+/s8DhDGYGIU7a+q9ori8ymFvDBtcAVmgc= | ||
github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM= | ||
github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= | ||
github.com/zenizh/go-capturer v0.0.0-20211219060012-52ea6c8fed04 h1:qXafrlZL1WsJW5OokjraLLRURHiw0OzKHD/RNdspp4w= | ||
github.com/zenizh/go-capturer v0.0.0-20211219060012-52ea6c8fed04/go.mod h1:FiwNQxz6hGoNFBC4nIx+CxZhI3nne5RmIOlT/MXcSD4= | ||
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= | ||
golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= |