Reverted to acts_as_ferret stable branch (ferret 0.9.6)
author Nguyễn Thái Ngọc Duy <[email protected]>
Fri, 25 Aug 2006 10:14:58 +0000 (25 17:14 +0700)
committer Nguyễn Thái Ngọc Duy <[email protected]>
Fri, 25 Aug 2006 10:14:58 +0000 (25 17:14 +0700)
vendor/plugins/acts_as_ferret/lib/acts_as_ferret.rb
vendor/plugins/acts_as_ferret/lib/multi_index.rb

index ec8bded..644d333 100644 (file)
 require 'active_record'
 require 'set'
 
-# 0.10 problems
-# Ferret::Search::Similarity, Ferret::Search::Similarity.default missing
-# IndexReader#latest? segfaults when used on multiple indexes
-# :offset and :limit get ignored by search_each
-# query_parser ignores or_default
 
 # Yet another Ferret Mixin.
 #
@@ -98,42 +93,35 @@ module FerretMixin
         # helper that defines a method that adds the given field to a lucene 
         # document instance
         def define_to_field_method(field, options = {})         
-          options = { 
-            :store => :no, 
-            :index => :yes, 
-            :term_vector => :with_positions_offsets,
-            :boost => 1.0 }.update(options)
-          fields_for_ferret[field] = options
-          define_method("#{field}_to_ferret".to_sym) do
+          default_opts = { :store => Ferret::Document::Field::Store::NO, 
+            :index => Ferret::Document::Field::Index::TOKENIZED, 
+            :term_vector => Ferret::Document::Field::TermVector::NO,
+            :binary => false,
+            :boost => 1.0
+          }
+          default_opts.update(options) if options.is_a?(Hash) 
+          fields_for_ferret << field 
+          define_method("#{field}_to_ferret".to_sym) do                              
             begin
+              #val = self[field] || self.instance_variable_get("@#{field.to_s}".to_sym) || self.method(field).call
               val = content_for_field_name(field)
             rescue
-              logger.warn("Error retrieving value for field #{field}: #{$!}")
+              logger.debug("Error retrieving value for field #{field}: #{$!}")
               val = ''
             end
             logger.debug("Adding field #{field} with value '#{val}' to index")
-            val
-          end
-        end
-
-        def add_fields(field_config)
-          if field_config.respond_to?(:each_pair)
-            field_config.each_pair do |key,val|
-              define_to_field_method(key,val)                  
-            end
-          elsif field_config.respond_to?(:each)
-            field_config.each do |field| 
-              define_to_field_method(field)
-            end                
+            Ferret::Document::Field.new(field.to_s, val, 
+                                        default_opts[:store], 
+                                        default_opts[:index], 
+                                        default_opts[:term_vector], 
+                                        default_opts[:binary], 
+                                        default_opts[:boost]) 
           end
         end
         
         # TODO: do we need to define this at this level ? Maybe it's
         # sufficient to do this only in classes calling acts_as_ferret ?
-        #
-        # moved below inside class_eval in #acts_as_ferret, let's see 
-        # what happens ;-)
-        #def reloadable?; false end
+        def reloadable?; false end
         
         @@ferret_indexes = Hash.new
         def ferret_indexes; @@ferret_indexes end
@@ -148,13 +136,7 @@ module FerretMixin
         # fields:: names all fields to include in the index. If not given,
         #   all attributes of the class will be indexed. You may also give
         #   symbols pointing to instance methods of your model here, i.e. 
-        #   to retrieve and index data from a related model. 
-        #
-        # additional_fields:: names fields to include in the index, in addition 
-        #   to those derived from the db scheme. use if you want to add
-        #   custom fields derived from methods to the db fields (which will be picked 
-        #   by aaf). This option will be ignored when the fields option is given, in 
-        #   that case additional fields get specified there.
+        #   to retrieve and index data from a related model.
         #
         # index_dir:: declares the directory where to put the index for this class.
         #   The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME. 
@@ -174,30 +156,30 @@ module FerretMixin
         # default value is 1000
         #
         # ferret_options may be:
-        # or_default:: - whether query terms are required by
-        #   default (the default, false), or not (true)
+        # occur_default:: - whether query terms are required by
+        #   default (the default), or not. Specify one of 
+        #   Ferret::Search::BooleanClause::Occur::MUST or 
+        #   Ferret::Search::BooleanClause::Occur::SHOULD
         # 
         # analyzer:: the analyzer to use for query parsing (default: nil,
-        #   wihch means the ferret StandardAnalyzer gets used)
+        #   which means the ferret default Analyzer gets used)
         #
         def acts_as_ferret(options={}, ferret_options={})
           configuration = { 
+            :fields => nil,
             :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name.underscore}",
             :store_class_name => false,
             :single_index => false,
             :max_results => 1000
           }
           ferret_configuration = {
-            #:or_default => false, # lead to 'cannot convert false to Integer'
-            #in some cases
-            :handle_parser_errors => true
-            #:max_clauses => 512,
-            #:default_field => '*',
-            #:analyzer => Ferret::Analysis::StandardAnalyzer.new,
-            # :wild_card_downcase => true
+            :occur_default => Ferret::Search::BooleanClause::Occur::MUST,
+            :handle_parse_errors => true,
+            :default_search_field => '*',
+            :analyzer => Ferret::Analysis::StandardAnalyzer.new,
+            # :wild_lower => true
           }
           configuration.update(options) if options.is_a?(Hash)
-
           # apply appropriate settings for shared index
           if configuration[:single_index] 
             configuration[:index_dir] = "#{FerretMixin::Acts::ARFerret::index_dir}/shared" 
@@ -207,8 +189,7 @@ module FerretMixin
           # these properties are somewhat vital to the plugin and shouldn't
           # be overwritten by the user:
           ferret_configuration.update(
-
-            :key               => (configuration[:single_index] ? [:id, :class_name] : :id),
+            :key               => (configuration[:single_index] ? ['id', 'class_name'] : 'id'),
             :path              => configuration[:index_dir],
             :auto_flush        => true,
             :create_if_missing => true
@@ -227,18 +208,21 @@ module FerretMixin
               cattr_accessor :configuration
               cattr_accessor :ferret_configuration
               
-              @@fields_for_ferret = Hash.new
+              @@fields_for_ferret = Array.new
               @@configuration = configuration
               @@ferret_configuration = ferret_configuration
-
-              if configuration[:fields]
-                add_fields(configuration[:fields])
+              
+              if configuration[:fields].respond_to?(:each_pair)
+                configuration[:fields].each_pair do |key,val|
+                  define_to_field_method(key,val)                  
+                end
+              elsif configuration[:fields].respond_to?(:each)
+                configuration[:fields].each do |field| 
+                  define_to_field_method(field)
+                end                
               else
-                add_fields(self.new.attributes.keys.map { |k| k.to_sym })
-                add_fields(configuration[:additional_fields])
+                @@fields_for_ferret = nil
               end
-
-              def self.reloadable?; false end
             EOV
           FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
         end
@@ -258,36 +242,11 @@ module FerretMixin
         # When calling this method manually, you can give any additional 
         # model classes that should also go into this index as parameters. 
         # Useful when using the :single_index option.
-        # Note that attributes named the same in different models will share
-        # the same field options in the shared index.
-        def rebuild_index(*models)
-          models << self
-          # default attributes for fields
-          fi = Ferret::Index::FieldInfos.new(:store => :no, 
-                                             :index => :yes, 
-                                             :term_vector => :no,
-                                             :boost => 1.0)
-          # primary key
-          fi.add_field(:id, :store => :yes, :index => :untokenized) 
-          # class_name
-          if configuration[:store_class_name]
-            fi.add_field(:class_name, :store => :yes, :index => :untokenized) 
-          end
-          # collect field options from all models
-          fields = {}
-          models.each do |model|
-            fields.update(model.fields_for_ferret)
-          end
-          logger.debug("class #{self.name}: fields for index: #{fields.keys.join(',')}")
-          fields.each_pair do |field, options|
-            fi.add_field(field, { :store => :no, 
-                                  :index => :yes }.update(options)) 
-          end
-          fi.create_index(ferret_configuration[:path])
-
-          index = Ferret::Index::Index.new(ferret_configuration.dup.update(:auto_flush => false))
+        def rebuild_index(*additional_models)
+          index = Ferret::Index::Index.new(ferret_configuration.merge(:create => true))
+          additional_models << self
           batch_size = 1000
-          models.each do |model|
+          additional_models.each do |model|
             # index in batches of 1000 to limit memory consumption (fixes #24)
             model.transaction do
               0.step(model.count, batch_size) do |i|
@@ -381,7 +340,7 @@ module FerretMixin
         # determine all field names in the shared index
         def single_index_field_names(models)
           @single_index_field_names ||= (
-              searcher = Ferret::Search::Searcher.new(class_index_dir)
+              searcher = Ferret::Search::IndexSearcher.new(class_index_dir)
               if searcher.reader.respond_to?(:get_field_names)
                 (searcher.reader.send(:get_field_names) - ['id', 'class_name']).to_a
               else
@@ -414,19 +373,17 @@ module FerretMixin
               #  class_clauses << "class_name:#{model}"
               #end
               #q << " AND (#{class_clauses.join(' OR ')})"
-
-              qp = Ferret::QueryParser.new (ferret_configuration)
-              qp.fields = ferret_index.send(:reader).field_names
+              qp = Ferret::QueryParser.new(ferret_configuration[:default_search_field], ferret_configuration.update(:fields => single_index_field_names(options[:models])))
               original_query = qp.parse(q)
             end
             #else
             q = Ferret::Search::BooleanQuery.new
-            q.add_query(original_query, :must)
+            q.add_query(original_query, Ferret::Search::BooleanClause::Occur::MUST)
             model_query = Ferret::Search::BooleanQuery.new
             options[:models].each do |model|
-              model_query.add_query(Ferret::Search::TermQuery.new(:class_name, model.name), :should)
+              model_query.add_query(Ferret::Search::TermQuery.new(Ferret::Index::Term.new('class_name', model.name)), Ferret::Search::BooleanClause::Occur::SHOULD)
             end
-            q.add_query(model_query, :must)
+            q.add_query(model_query, Ferret::Search::BooleanClause::Occur::MUST)
             #end
           end
           #puts q.to_s
@@ -474,14 +431,11 @@ module FerretMixin
         # instead of the result list!
         # 
         def find_id_by_contents(q, options = {})
-          deprecated_options_support(options)
-          options[:limit] = configuration[:max_results] if options[:limit] == :all
-
+          options[:num_docs] = configuration[:max_results] if options[:num_docs] == :all
           result = []
           index = self.ferret_index
-          #hits = index.search(q, options)
-          #hits.each do |hit, score|
-          total_hits = index.search_each(q, options) do |hit, score|
+          hits = index.search(q, options)
+          hits.each do |hit, score|
             # only collect result data if we intend to return it
             doc = index[hit]
             model = configuration[:store_class_name] ? doc[:class_name] : self.name
@@ -492,7 +446,7 @@ module FerretMixin
             end
           end
           logger.debug "id_score_model array: #{result.inspect}"
-          return block_given? ? total_hits : result
+          return block_given? ? hits.total_hits : result
         end
         
         # requires the store_class_name option of acts_as_ferret to be true
@@ -515,20 +469,20 @@ module FerretMixin
         # be yielded, and the total number of hits is returned.
         #
         def id_multi_search(query, additional_models = [], options = {})
-          deprecated_options_support(options)
-          options[:limit] = configuration[:max_results] if options[:limit] == :all
+          options[:num_docs] = configuration[:max_results] if options[:num_docs] == :all
           additional_models << self
           searcher = multi_index(additional_models)
           result = []
-          total_hits = searcher.search_each (query, options) do |hit, score|
-            doc = searcher[hit]
+          hits = searcher.search(query, options)
+          hits.each { |hit, score|
+            doc = searcher.doc(hit)
             if block_given?
               yield doc[:class_name], doc[:id].to_i, score
             else
               result << { :model => doc[:class_name], :id => doc[:id], :score => score }
             end
-          end
-          return block_given? ? total_hits : result
+          }
+          return block_given? ? hits.total_hits : result
         end
         
         # returns a MultiIndex instance operating on a MultiReader
@@ -537,17 +491,7 @@ module FerretMixin
           key = model_classes.inject("") { |s, clazz| s << clazz.name }
           @@multi_indexes[key] ||= MultiIndex.new(model_classes, ferret_configuration)
         end
-
-        def deprecated_options_support(options)
-          if options[:num_docs]
-            logger.warn ":num_docs is deprecated, use :limit instead!"
-            options[:limit] ||= options[:num_docs]
-          end
-          if options[:first_doc]
-            logger.warn ":first_doc is deprecated, use :offset instead!"
-            options[:offset] ||= options[:first_doc]
-          end
-        end
+        
       end
       
       
@@ -563,9 +507,7 @@ module FerretMixin
         # add to index
         def ferret_create
           logger.debug "ferret_create/update: #{self.class.name} : #{self.id}"
-          if @ferret_reindex
-            self.class.ferret_index << self.to_doc
-          end
+          self.class.ferret_index << self.to_doc if @ferret_reindex
           @ferret_reindex = true
           true
         end
@@ -575,11 +517,12 @@ module FerretMixin
         def ferret_destroy
           logger.debug "ferret_destroy: #{self.class.name} : #{self.id}"
           begin
-            query = Ferret::Search::TermQuery.new(:id, self.id.to_s)
+            query = Ferret::Search::TermQuery.new(Ferret::Index::Term.new('id',self.id.to_s))
             if self.class.configuration[:single_index]
               bq = Ferret::Search::BooleanQuery.new
-              bq.add_query(query, :must)
-              bq.add_query(Ferret::Search::TermQuery.new(:class_name, self.class.name), :must)
+              bq.add_query(query, Ferret::Search::BooleanClause::Occur::MUST)
+              bq.add_query(Ferret::Search::TermQuery.new(Ferret::Index::Term.new('class_name', self.class.name)),
+                           Ferret::Search::BooleanClause::Occur::MUST)
               query = bq
             end
             self.class.ferret_index.query_delete(query)
@@ -593,31 +536,36 @@ module FerretMixin
         def to_doc
           logger.debug "creating doc for class: #{self.class.name}, id: #{self.id}"
           # Churn through the complete Active Record and add it to the Ferret document
-          doc = Ferret::Document.new
+          doc = Ferret::Document::Document.new
           # store the id of each item
-          doc[:id] = self.id
-
+          doc << Ferret::Document::Field.new( "id", self.id, 
+          Ferret::Document::Field::Store::YES, 
+          Ferret::Document::Field::Index::UNTOKENIZED )
           # store the class name if configured to do so
           if configuration[:store_class_name]
-            doc[:class_name] = self.class.name
+            doc << Ferret::Document::Field.new( "class_name", self.class.name,
+            Ferret::Document::Field::Store::YES, 
+            Ferret::Document::Field::Index::UNTOKENIZED ) # have to tokenize to be able to use class_name field in queries ?!
           end
           # iterate through the fields and add them to the document
-          #if fields_for_ferret
+          if fields_for_ferret
             # have user defined fields
-          fields_for_ferret.each_pair do |field, config|
-            doc[field] = self.send("#{field}_to_ferret") unless config[:ignore]
-          end
-          #else
+            fields_for_ferret.each do |field|
+              doc << self.send("#{field}_to_ferret")
+            end
+          else
             # take all fields
-            # TODO shouldn't be needed any more
-          #  puts "remove me!"
-          #  self.attributes.each_pair do |key,val|
-          #    unless key == :id
-          #      logger.debug "add field #{key} with value #{val}"
-          #      doc[key] = val.to_s
-          #    end
-           # end
-          #end
+            self.attributes.each_pair do |key,val|
+              unless key == :id
+                logger.debug "add field #{key} with value #{val}"
+                doc << Ferret::Document::Field.new(
+                                           key, 
+                                           val.to_s, 
+                                           Ferret::Document::Field::Store::NO, 
+                                           Ferret::Document::Field::Index::TOKENIZED)
+              end
+            end
+          end
           return doc
         end
 
@@ -699,7 +647,7 @@ module FerretMixin
               term_query.boost = cur.score / best_score
             end
             begin
-              query.add_query(term_query, :should
+              query.add_query(term_query, Ferret::Search::BooleanClause::Occur::SHOULD
             rescue Ferret::Search::BooleanQuery::TooManyClauses
               break
             end
@@ -708,7 +656,8 @@ module FerretMixin
           end
           # exclude ourselves
           t = Ferret::Index::Term.new('id', self.id.to_s)
-          query.add_query(Ferret::Search::TermQuery.new(t), :must_not)
+          query.add_query(Ferret::Search::TermQuery.new(t),
+                          Ferret::Search::BooleanClause::Occur::MUST_NOT)
           return query
         end
 
index 4887e9b..87eea78 100644 (file)
@@ -4,14 +4,15 @@ module FerretMixin
       # not threadsafe
       class MultiIndex
         
+        attr_reader :reader
+        
         # todo: check for necessary index rebuilds in this place, too
         # idea - each class gets a create_reader method that does this
         def initialize(model_classes, options = {})
           @model_classes = model_classes
           @options = { 
-            :default_field => '*',
-            #:analyzer => Ferret::Analysis::WhiteSpaceAnalyzer.new
-            :analyzer => Ferret::Analysis::StandardAnalyzer.new
+            :default_search_field => '*',
+            :analyzer => Ferret::Analysis::WhiteSpaceAnalyzer.new
           }.update(options)
         end
         
@@ -22,35 +23,39 @@ module FerretMixin
           searcher.search(query, options)
         end
 
-        def search_each(query, options={}, &block)
-          query = process_query(query)
-          searcher.search_each(query, options={}, &block)
-        end
-
         # checks if all our sub-searchers still are up to date
         def latest?
-          return false unless @reader
-          # segfaults with 0.10.0 --> TODO report as bug @reader.latest?
-          @sub_readers.each do |r| 
-            return false unless r.latest? 
+          return false unless @searcher
+          @sub_searchers.each do |s| 
+            return false unless s.reader.latest? 
           end
           true
         end
 
         def ensure_searcher
           unless latest?
-            #field_names = Set.new
-            @sub_readers = @model_classes.map { |clazz| 
+            field_names = Set.new
+            @sub_searchers = @model_classes.map { |clazz| 
               begin
-                reader = Ferret::Index::IndexReader.new(clazz.class_index_dir)
+                searcher = Ferret::Search::IndexSearcher.new(clazz.class_index_dir)
               rescue Exception
                 puts "error opening #{clazz.class_index_dir}: #{$!}"
               end
-            #  field_names << reader.field_names.to_set
-              reader
+              if searcher.reader.respond_to?(:get_field_names)
+                field_names << searcher.reader.send(:get_field_names).to_set
+              elsif clazz.fields_for_ferret
+                field_names << clazz.fields_for_ferret.to_set
+              else
+                puts <<-END
+  unable to retrieve field names for class #{clazz.name}, please 
+  consider naming all indexed fields in your call to acts_as_ferret!
+                END
+                clazz.content_columns.each { |col| field_names << col.name }
+              end
+              searcher
             }
-            @reader = Ferret::Index::IndexReader.new(@sub_readers)
-            @searcher = Ferret::Search::Searcher.new(@reader)
+            @searcher = Ferret::Search::MultiSearcher.new(@sub_searchers)
+            @field_names = field_names.flatten.to_a
             @query_parser = nil # trigger re-creation from new field_name array
           end
         end
@@ -61,16 +66,17 @@ module FerretMixin
         end
         
         def doc(i)
-          searcher[i]
+          searcher.doc(i)
         end
-        alias :[] :doc
         
         def query_parser
-          ensure_searcher 
           unless @query_parser
-            @query_parser ||= Ferret::QueryParser.new(@options)
+            ensure_searcher # we dont need the searcher, but the @field_names array is built by this function, too
+            @query_parser ||= Ferret::QueryParser.new(
+                                @options[:default_search_field],
+                                { :fields => @field_names }.merge(@options)
+                              )
           end
-          @query_parser.fields = @reader.field_names
           @query_parser
         end