Skip to content

Commit

Permalink
WIP on TSVector
Browse files Browse the repository at this point in the history
  • Loading branch information
Yacine Petitprez committed Jul 2, 2018
1 parent 369e832 commit 0c8897d
Show file tree
Hide file tree
Showing 7 changed files with 219 additions and 15 deletions.
5 changes: 3 additions & 2 deletions manual/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ If you're in hurry and already at ease with Active Record pattern, a simple stan
- [Transactions And savepoints](querying/Transaction.md)
- [Pagination](querying/Pagination.md)

## Advanced
## Extensions

- [JSONB integration](querying/Jsonb.md)
- [JSONB integration](extensions/jsonb/Jsonb.md)
- [TSVector and Full Text Search](extensions/full_text_searchable/FullTextSearchable.md)
35 changes: 35 additions & 0 deletions manual/extensions/full_text_searchable/FullTextSearchable.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
The full-text search plugin offers full integration with the `tsvector` capabilities of
PostgreSQL.

It allows you to query models through the text content of one or multiple fields.

### The blog example

Let's assume we have a blog and want to implement full text search over title and content:

```crystal
create_table "posts" do |t|
t.string "title", nullable: false
t.string "content", nullable: false
t.full_text_searchable on: [{"title", 'A'}, {"content", 'C'}]
end
```

This migration creates a third column named `full_text_vector` of type `tsvector`,
a GIN index, and a trigger plus a function that automatically keep this column up to date.

In the `on` option, `{"title", 'A'}` makes the content of "title" searchable with priority level (weight) 'A', which tells Postgres that the title is more meaningful than the article content itself (indexed here with weight 'C').

Now, let's build some models:

```crystal
Post.create!({title: "About poney", content: "Poney are cool"})
Post.create!({title: "About dog and cat", content: "Cat and dog are cool. But not as much as poney"})
Post.create!({title: "You won't believe: She raises her poney like as star!", content: "She's col because poney are cool"})
```

Searching is now straightforward:
```
Post.query.search("poney") # Returns all the articles!
```
File renamed without changes.
43 changes: 35 additions & 8 deletions spec/model/full_text_searchable_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,16 @@ module FullTextSearchableSpec

describe "test tsv searchable" do
it "Can translate client query to ts_query" do
Clear::Model::FullTextSearchable.text_to_search("rick & morty").should eq("'rick' | '&' | 'morty'")
Clear::Model::FullTextSearchable.text_to_search("rick+morty").should eq("'rick morty'")
Clear::Model::FullTextSearchable.text_to_search("\"rick morty\"").should eq("'rick morty'")
Clear::Model::FullTextSearchable.text_to_search("'rick morty'").should eq("'rick morty'")
Clear::Model::FullTextSearchable.text_to_search("rick morty").should eq("'rick' | 'morty'")
Clear::Model::FullTextSearchable.text_to_search("rick -morty").should eq("'rick' & !'morty'")
Clear::Model::FullTextSearchable.text_to_search("rick -'rick hunter'").should eq("'rick' & !'rick hunter' ")
Clear::Model::FullTextSearchable.text_to_search("l'esplanade").should eq("'l' | 'esplanade'")
Clear::Model::FullTextSearchable.to_tsq("rick & morty").should eq("'rick & morty'")
Clear::Model::FullTextSearchable.to_tsq("rick+morty").should eq("'rick morty'")
Clear::Model::FullTextSearchable.to_tsq("\"rick morty\"").should eq("'rick morty'")
Clear::Model::FullTextSearchable.to_tsq("'rick morty'").should eq("'rick morty'")
Clear::Model::FullTextSearchable.to_tsq("rick morty").should eq("'rick' | 'morty'")
Clear::Model::FullTextSearchable.to_tsq("rick -morty").should eq("'rick' & !'morty'")
Clear::Model::FullTextSearchable.to_tsq("rick -'rick hunter'").should eq("'rick' & !'rick hunter' ")
Clear::Model::FullTextSearchable.to_tsq("l'esplanade").should eq("'l''esplanade'")
Clear::Model::FullTextSearchable.to_tsq("'l''usine'").should eq("'l''usine'")
Clear::Model::FullTextSearchable.to_tsq("'l'usine").should eq("'l''usine'")
end

it "Can search through TS vector" do
Expand All @@ -62,4 +64,29 @@ module FullTextSearchableSpec
end
end
end

describe "Clear::TSVector" do
  it "can be encoded/decoded" do
    # Binary `tsvector` payload: a big-endian UInt32 lexeme count (9), then for
    # each lexeme its NUL-terminated text, a big-endian UInt16 position count
    # and one big-endian UInt16 per position.
    #
    # In each position entry the two highest bits carry the weight using
    # PostgreSQL's encoding (3 = A, 2 = B, 1 = C, 0 = D) and the 14 lowest bits
    # carry the position. Here a 0xC0 high byte therefore means weight A and a
    # 0x40 ('@') high byte means weight C.
    data = ("\u0000\u0000\u0000\tbad\u0000\u0000\u0001@\fbetter\u0000\u0000\u0001" +
            "\xC0\u0001break\u0000\u0000\u0001@\vcall\u0000\u0000\u0001\xC0\u0002" +
            "follow\u0000\u0000\u0001@\u0004goodman\u0000\u0000\u0001@\u0006lawyer" +
            "\u0000\u0000\u0001@\tsaul\u0000\u0000\u0002\xC0\u0003@\u0005sketchi" +
            "\u0000\u0000\u0001@\b").bytes

    tsvec = Clear::TSVector.decode(
      Slice(UInt8).new(data.to_unsafe, data.size)
    )

    # 0x400C => weight bits 01 (C), position 12
    tsvec["bad"].positions[0].position.should eq(12)
    tsvec["bad"].positions[0].weight.should eq('C')

    # 0x4004 => weight bits 01 (C), position 4
    tsvec["follow"].positions[0].position.should eq(4)
    tsvec["follow"].positions[0].weight.should eq('C')

    # 0xC001 => weight bits 11 (A), position 1
    tsvec["better"].positions[0].position.should eq(1)
    tsvec["better"].positions[0].weight.should eq('A')

    # Unknown lexemes are reported as nil by the `[]?` accessor.
    tsvec["other"]?.should be_nil

    # Round trip to the textual representation sent back to the database.
    tsvec.to_db.should eq "'bad':12C 'better':1A 'break':11C 'call':2A " +
                          "'follow':4C 'goodman':6C 'lawyer':9C " +
                          "'saul':3A,5C 'sketchi':8C"
  end
end
end
48 changes: 44 additions & 4 deletions src/clear/extensions/full_text_searchable/model.cr
Original file line number Diff line number Diff line change
@@ -1,11 +1,38 @@
require "./tsvector"

module Clear::Model::FullTextSearchable
# Set this model as searchable using tsvector
macro full_text_searchable(through = "full_text_vector", catalog = "pg_catalog.english")
# TODO: Use converter and tsv structure
column( {{through.id}} : String, presence: false )
column( {{through.id}} : Clear::TSVector, presence: false, converter: Clear::TSVector::Converter )

scope "search" do |str|
where{ op({{through.id}}, to_tsquery({{catalog}}, str), "@@") }
where{ op({{through.id}}, to_tsquery({{catalog}},
Clear::Model::FullTextSearchable.to_tsq(str)), "@@") }
end
end

# :nodoc:
# Work-in-progress tokenizer for a search string typed by a user.
#
# Walks `text` character by character while tracking whether the cursor is
# inside a quoted section (`in_quote`) and which quote character opened it
# (`quote_start`). The open problem noted by the author is how to treat the
# `'` character.
#
# NOTE(review): `exp` is declared but nothing is ever appended to it, and the
# method evaluates to whatever `each_with_index` returns, so no expressions
# are produced yet.
def self.split_to_exp(text)
in_quote = false
quote_start = nil
ignore_next_quote = false
exp = [] of String
text.chars.each_with_index do |c, idx|
case c
when /[A-Z0-9]/i
# Alphanumeric character (the pattern is case-insensitive).
# NOTE(review): presumably the flag is meant to stop a quote that directly
# follows a letter (as in "l'esplanade") from opening a quoted section;
# nothing reads it yet -- confirm the intent.
ignore_next_quote = true
ignore_next_quote
when '\'', '"'
# Placeholder for "closing the quote that is currently open": the branch
# has no body yet.
if (in_quote && quote_start == c)
end

# Every quote character currently (re)opens a quoted section.
in_quote = true
quote_start = c
end
end
end

Expand All @@ -14,7 +41,20 @@ module Clear::Model::FullTextSearchable
# Author note: pg `to_tsquery` is powerful but can easily fail to parse user input.
# The `search` scope therefore goes through the `to_tsq` wrapper, which is meant to ensure
# that the request is understood and ALWAYS produces a legal string for `to_tsquery`.
def self.text_to_search(text)
text
# Converts a user-supplied search string into the text handed to `to_tsquery`.
#
# Currently a pass-through: the first statement returns `text` unchanged, so
# everything after it is unreachable scaffolding for the future parser.
def self.to_tsq(text)
return text
# --- Unreachable from here on (work in progress) ---
current_str = ""
in_quote = false
text.chars.each_with_index do |c, idx|
case c
when '\''
# Each single quote toggles the "inside a quoted phrase" state.
in_quote = !in_quote
if (!in_quote)
# A quoted phrase has just been closed; `current_str` is evaluated but not
# used yet.
current_str
end
when '-'
# No handling yet for '-'.
else
# No handling yet for any other character.
end
end
end
end
101 changes: 101 additions & 0 deletions src/clear/extensions/full_text_searchable/tsvector.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# In-memory representation of a PostgreSQL `tsvector` value decoded from its
# binary form.
#
# Layout read by this class (every integer is big-endian):
#
# * `UInt32`: number of lexemes
# * for each lexeme:
#   * the lexeme text, terminated by a NUL byte
#   * `UInt16`: number of positions
#   * one `UInt16` per position: the two highest bits hold the weight
#     (3 = A, 2 = B, 1 = C, 0 = D), the 14 lowest bits hold the position
class Clear::TSVector
  # One lexeme together with every position (and weight) at which it occurs.
  struct Lexem
    record Position, weight : Char, position : UInt16

    getter value : String = ""
    getter positions : Array(Position) = [] of Position

    WEIGHTS = ['A', 'B', 'C', 'D']

    # Number of bits to shift a packed entry to isolate its 2-bit weight.
    WEIGHT_SHIFT = 14
    # Mask keeping the 14 position bits of a packed entry.
    POSITION_MASK = 0x3FFF_u16

    # Reads one lexeme from *io*, which must be positioned on the first byte
    # of the lexeme text.
    #
    # Raises if *io* ends before the lexeme is complete.
    def initialize(io)
      chars = [] of UInt8

      # The lexeme text runs up to (and excluding) the NUL terminator.
      while ((c = io.read_byte.not_nil!) != 0)
        chars << c
      end

      @value = String.new(chars.to_unsafe, chars.size)

      # Read whole big-endian integers: assembling them with `UInt8 << 8`
      # would always yield 0 (the shift is performed on 8 bits), silently
      # dropping the high byte and therefore the weight bits.
      pos_size = io.read_bytes(UInt16, IO::ByteFormat::BigEndian)

      pos_size.times do
        packed = io.read_bytes(UInt16, IO::ByteFormat::BigEndian)

        # PostgreSQL stores weights as 3 = A ... 0 = D, i.e. in the reverse
        # order of WEIGHTS.
        weight = WEIGHTS[3 - (packed >> WEIGHT_SHIFT)]

        @positions << Position.new(weight, packed & POSITION_MASK)
      end
    end
  end

  # Lexemes indexed by their text.
  getter lexems : Hash(String, Lexem) = {} of String => Lexem

  # Returns the lexeme stored under *key*; raises if it is absent.
  def [](key : String)
    lexems[key]
  end

  # Returns the lexeme stored under *key*, or `nil` if it is absent.
  def []?(key : String)
    lexems[key]?
  end

  # Renders the vector as text, e.g. `'saul':3A,5C 'call':2A`: each lexeme is
  # passed through `Clear::Expression[]`, followed by `:` and its
  # comma-separated positions, each suffixed with its weight letter.
  def to_db
    @lexems.values.map do |v|
      {
        Clear::Expression[v.value],
        v.positions.map { |p| {p.position, p.weight}.join }.join(","),
      }.join(":")
    end.join(" ")
  end

  # Decodes a complete vector from *io*.
  #
  # Raises if *io* ends before the announced number of lexemes was read.
  def initialize(io)
    size = io.read_bytes(UInt32, IO::ByteFormat::BigEndian)

    size.times do
      l = Lexem.new(io)
      @lexems[l.value] = l
    end
  end

  # Decodes a vector from its binary representation *x*.
  def self.decode(x : Slice(UInt8))
    io = IO::Memory.new(x, writeable: false)
    Clear::TSVector.new(io)
  end

  # Column converter between database values and `Clear::TSVector`.
  module Converter
    # Builds a `Clear::TSVector` from a raw column value.
    #
    # Accepts the binary payload (as `Slice(UInt8)` or as a `String` holding
    # those bytes), an already decoded `Clear::TSVector`, or `nil`.
    # Raises for any other type.
    def self.to_column(x) : Clear::TSVector?
      case x
      when String
        # `to_slice` spans the string's bytes; using `String#size` (a
        # character count) as the length would truncate non-ASCII payloads.
        Clear::TSVector.decode(x.to_slice)
      when Slice(UInt8)
        Clear::TSVector.decode(x)
      when Clear::TSVector
        x
      when Nil
        nil
      else
        raise "Cannot convert #{x.class} to TSVector"
      end
    end

    # Returns the textual form of *x* for the database, or `nil` for `nil`.
    def self.to_db(x : TSVector?)
      if (x)
        x.to_db
      else
        nil
      end
    end
  end
end
2 changes: 1 addition & 1 deletion src/clear/model/column.cr
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class Clear::Model::Column(T)
@value = x.as(T)
else
raise "Your field `#{@name}` is declared as `#{T}` but `NULL` value has been found in the database.\n" +
"Maybe declaring it as `#{T}?` would fix the mess !" if x.nil?
"Maybe declaring it as `#{T}?` would fix the error." if x.nil?
@value = x.not_nil!
end

Expand Down

0 comments on commit 0c8897d

Please sign in to comment.