From 0c8897d974ef718de81ccaf59603d1f5520779c2 Mon Sep 17 00:00:00 2001 From: Yacine Petitprez Date: Mon, 2 Jul 2018 14:15:00 +0200 Subject: [PATCH] WIP on TSVector --- manual/README.md | 5 +- .../FullTextSearchable.md | 35 ++++++ .../{querying => extensions/jsonb}/Jsonb.md | 0 spec/model/full_text_searchable_spec.cr | 43 ++++++-- .../extensions/full_text_searchable/model.cr | 48 ++++++++- .../full_text_searchable/tsvector.cr | 101 ++++++++++++++++++ src/clear/model/column.cr | 2 +- 7 files changed, 219 insertions(+), 15 deletions(-) create mode 100644 manual/extensions/full_text_searchable/FullTextSearchable.md rename manual/{querying => extensions/jsonb}/Jsonb.md (100%) create mode 100644 src/clear/extensions/full_text_searchable/tsvector.cr diff --git a/manual/README.md b/manual/README.md index 26523c0bf..67eb6b6ec 100644 --- a/manual/README.md +++ b/manual/README.md @@ -32,6 +32,7 @@ If you're in hurry and already at ease with Active Record pattern, a simple stan - [Transactions And savepoints](querying/Transaction.md) - [Pagination](querying/Pagination.md) -## Advanced +## Extensions -- [JSONB integration](querying/Jsonb.md) \ No newline at end of file +- [JSONB integration](extensions/jsonb/Jsonb.md) +- [TSVector and Full Text Search](extensions/full_text_searchable/FullTextSearchable.md) \ No newline at end of file diff --git a/manual/extensions/full_text_searchable/FullTextSearchable.md b/manual/extensions/full_text_searchable/FullTextSearchable.md new file mode 100644 index 000000000..36adbcf61 --- /dev/null +++ b/manual/extensions/full_text_searchable/FullTextSearchable.md @@ -0,0 +1,35 @@ +Full text search plugin offers full integration with `tsvector` capabilities of +Postgresql. + +It allows you to query models through the text content of one or multiple fields. 
+ +### The blog example + +Let's assume we have a blog and want to implement full text search over title and content: + +```crystal + create_table "posts" do |t| + t.string "title", nullable: false + t.string "content", nullable: false + + t.full_text_searchable on: [{"title", 'A'}, {"content", 'C'}] + end +``` + +This migration will create a third column named `full_text_vector` of type `tsvector`, +a GIN index, a trigger and a function to automatically update this column. + +In the `on` argument, `{"title", 'A'}` makes the content of "title" searchable with priority level (weight) "A", which tells Postgres that the title content is more meaningful than the article content itself. + +Now, let's build some models: + +```crystal + Post.create!({title: "About poney", content: "Poney are cool"}) + Post.create!({title: "About dog and cat", content: "Cat and dog are cool. But not as much as poney"}) + Post.create!({title: "You won't believe: She raises her poney like as star!", content: "She's col because poney are cool"}) +``` + +Searching is now straightforward: +``` + Post.query.search("poney") # Return all the articles ! 
+``` \ No newline at end of file diff --git a/manual/querying/Jsonb.md b/manual/extensions/jsonb/Jsonb.md similarity index 100% rename from manual/querying/Jsonb.md rename to manual/extensions/jsonb/Jsonb.md diff --git a/spec/model/full_text_searchable_spec.cr b/spec/model/full_text_searchable_spec.cr index 66fdda097..40829a68c 100644 --- a/spec/model/full_text_searchable_spec.cr +++ b/spec/model/full_text_searchable_spec.cr @@ -33,14 +33,16 @@ module FullTextSearchableSpec describe "test tsv searchable" do it "Can translate client query to ts_query" do - Clear::Model::FullTextSearchable.text_to_search("rick & morty").should eq("'rick' | '&' | 'morty'") - Clear::Model::FullTextSearchable.text_to_search("rick+morty").should eq("'rick morty'") - Clear::Model::FullTextSearchable.text_to_search("\"rick morty\"").should eq("'rick morty'") - Clear::Model::FullTextSearchable.text_to_search("'rick morty'").should eq("'rick morty'") - Clear::Model::FullTextSearchable.text_to_search("rick morty").should eq("'rick' | 'morty'") - Clear::Model::FullTextSearchable.text_to_search("rick -morty").should eq("'rick' & !'morty'") - Clear::Model::FullTextSearchable.text_to_search("rick -'rick hunter'").should eq("'rick' & !'rick hunter' ") - Clear::Model::FullTextSearchable.text_to_search("l'esplanade").should eq("'l' | 'esplanade'") + Clear::Model::FullTextSearchable.to_tsq("rick & morty").should eq("'rick & morty'") + Clear::Model::FullTextSearchable.to_tsq("rick+morty").should eq("'rick morty'") + Clear::Model::FullTextSearchable.to_tsq("\"rick morty\"").should eq("'rick morty'") + Clear::Model::FullTextSearchable.to_tsq("'rick morty'").should eq("'rick morty'") + Clear::Model::FullTextSearchable.to_tsq("rick morty").should eq("'rick' | 'morty'") + Clear::Model::FullTextSearchable.to_tsq("rick -morty").should eq("'rick' & !'morty'") + Clear::Model::FullTextSearchable.to_tsq("rick -'rick hunter'").should eq("'rick' & !'rick hunter' ") + 
Clear::Model::FullTextSearchable.to_tsq("l'esplanade").should eq("'l''esplanade'") + Clear::Model::FullTextSearchable.to_tsq("'l''usine'").should eq("'l''usine'") + Clear::Model::FullTextSearchable.to_tsq("'l'usine").should eq("'l''usine'") end it "Can search through TS vector" do @@ -62,4 +64,29 @@ module FullTextSearchableSpec end end end + + describe "Clear::TSVector" do + it "can be encoded/decoded" do + data = ("\u0000\u0000\u0000\tbad\u0000\u0000\u0001@\fbetter\u0000\u0000\u0001" + + "\xC0\u0001break\u0000\u0000\u0001@\vcall\u0000\u0000\u0001\xC0\u0002" + + "follow\u0000\u0000\u0001@\u0004goodman\u0000\u0000\u0001@\u0006lawyer" + + "\u0000\u0000\u0001@\tsaul\u0000\u0000\u0002\xC0\u0003@\u0005sketchi" + + "\u0000\u0000\u0001@\b").bytes + # Example specs + tsvec = Clear::TSVector.decode( + Slice(UInt8).new(data.to_unsafe, data.size) + ) + + tsvec["bad"].positions[0].position.should eq(12) + tsvec["bad"].positions[0].weight.should eq('A') + + tsvec["follow"].positions[0].position.should eq(4) + tsvec["follow"].positions[0].weight.should eq('A') + + tsvec["other"]?.should be_nil + tsvec.to_db.should eq "'bad':12A 'better':1A 'break':11A 'call':2A " + + "'follow':4A 'goodman':6A 'lawyer':9A " + + "'saul':3A,5A 'sketchi':8A" + end + end end diff --git a/src/clear/extensions/full_text_searchable/model.cr b/src/clear/extensions/full_text_searchable/model.cr index 92ca89943..f510435b6 100644 --- a/src/clear/extensions/full_text_searchable/model.cr +++ b/src/clear/extensions/full_text_searchable/model.cr @@ -1,11 +1,38 @@ +require "./tsvector" + module Clear::Model::FullTextSearchable # Set this model as searchable using tsvector macro full_text_searchable(through = "full_text_vector", catalog = "pg_catalog.english") # TODO: Use converter and tsv structure - column( {{through.id}} : String, presence: false ) + column( {{through.id}} : Clear::TSVector, presence: false, converter: Clear::TSVector::Converter ) scope "search" do |str| - where{ op({{through.id}}, 
to_tsquery({{catalog}}, str), "@@") } + where{ op({{through.id}}, to_tsquery({{catalog}}, + Clear::Model::FullTextSearchable.to_tsq(str)), "@@") } + end + end + + # :nodoc: + # Split a chain written by a user + # A problem to solve is the `'` character + def self.split_to_exp(text) + in_quote = false + quote_start = nil + ignore_next_quote = false + exp = [] of String + text.chars.each_with_index do |c, idx| + case c + when /[A-Z0-9]/i + # if it's a alphanumerical character + ignore_next_quote = true + ignore_next_quote + when '\'', '"' + if (in_quote && quote_start == c) + end + + in_quote = true + quote_start = c + end end end @@ -14,7 +41,20 @@ module Clear::Model::FullTextSearchable # Author note: pg `to_tsquery` is awesome but can easily fail to parse. # `search` method use then a wrapper text_to_search used to ensure than # request is understood and produce ALWAYS legal string for `to_tsquery` - def self.text_to_search(text) - text + def self.to_tsq(text) + return text + current_str = "" + in_quote = false + text.chars.each_with_index do |c, idx| + case c + when '\'' + in_quote = !in_quote + if (!in_quote) + current_str + end + when '-' + else + end + end end end diff --git a/src/clear/extensions/full_text_searchable/tsvector.cr b/src/clear/extensions/full_text_searchable/tsvector.cr new file mode 100644 index 000000000..b604ee85b --- /dev/null +++ b/src/clear/extensions/full_text_searchable/tsvector.cr @@ -0,0 +1,101 @@ +class Clear::TSVector + struct Lexem + record Position, weight : Char, position : UInt16 + + getter value : String = "" + getter positions : Array(Position) = [] of Position + + WEIGHTS = ['A', 'B', 'C', 'D'] + + def initialize(io) + chars = [] of UInt8 + + while ((c = io.read_byte.not_nil!) != 0) + chars << c + end + + @value = String.new(chars.to_unsafe, chars.size) + + pos_size : UInt16 = 0_u16 + + pos_size |= io.read_byte.not_nil! << 8 + pos_size |= io.read_byte.not_nil! 
<< 0 + + pos_size.times do + pos_off_and_weight : UInt16 = 0_u16 + + pos_off_and_weight |= io.read_byte.not_nil! << 8 + pos_off_and_weight |= io.read_byte.not_nil! << 0 + + w = WEIGHTS[(pos_off_and_weight & 0xC000) >> 14] + + @positions << Position.new(w, pos_off_and_weight & (~0xC000)) + end + end + end + + getter lexems : Hash(String, Lexem) = {} of String => Lexem + + def [](key : String) + lexems[key] + end + + def []?(key : String) + lexems[key]? + end + + def to_db + @lexems.values.map do |v| + { + Clear::Expression[v.value], + v.positions.map { |p| {p.position, p.weight}.join }.join(","), + }.join(":") + end.join(" ") + end + + def initialize(io) + size : UInt32 = 0 + + size |= io.read_byte.not_nil! << 24 + size |= io.read_byte.not_nil! << 16 + size |= io.read_byte.not_nil! << 8 + size |= io.read_byte.not_nil! << 0 + + size.times.each do |x| + l = Lexem.new(io) + @lexems[l.value] = l + end + end + + def self.decode(x : Slice(UInt8)) + io = IO::Memory.new(x, writeable: false) + Clear::TSVector.new(io) + end + + module Converter + def self.to_column(x) : Clear::TSVector? + case x + when String + data = x + s = Slice(UInt8).new(data.to_unsafe, data.size) + return Clear::TSVector.decode(s) + when Slice(UInt8) + return Clear::TSVector.decode(x) + when Clear::TSVector + return x + when Nil + return nil + else + raise "Cannot convert #{x.class} to TSVector" + end + end + + def self.to_db(x : TSVector?) + if (x) + x.to_db + else + nil + end + end + end +end diff --git a/src/clear/model/column.cr b/src/clear/model/column.cr index dd704a8e1..668564183 100644 --- a/src/clear/model/column.cr +++ b/src/clear/model/column.cr @@ -50,7 +50,7 @@ class Clear::Model::Column(T) @value = x.as(T) else raise "Your field `#{@name}` is declared as `#{T}` but `NULL` value has been found in the database.\n" + - "Maybe declaring it as `#{T}?` would fix the mess !" if x.nil? + "Maybe declaring it as `#{T}?` would fix the error." if x.nil? @value = x.not_nil! end