author     أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net>  2015-05-07 11:22:20 (GMT)
committer  أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net>  2015-05-07 11:22:20 (GMT)
commit     50b9306f9b4e3cd4b08dcc2f5bcb39ade6d0c32c (patch)
tree       7a1733a984c947f52e7d3a6a19bdfa9f35908ba9
python-whoosh (2.7.0-1) unstable; urgency=medium
  * New upstream release.
  * Update watch file.
    Thanks to Piotr Ożarowski
  * debian/copyright: Update copyright years.
  * debian/upstream/metadata: Added upstream metadata.

# imported from the archive
-rw-r--r--LICENSE.txt26
-rw-r--r--MANIFEST.in8
-rw-r--r--PKG-INFO88
-rw-r--r--README.txt68
-rw-r--r--benchmark/dcvgr10.txt.gzbin0 -> 201819 bytes
-rw-r--r--benchmark/dictionary.py43
-rw-r--r--benchmark/enron.py185
-rw-r--r--benchmark/marc21.py297
-rw-r--r--benchmark/reuters.py38
-rw-r--r--benchmark/reuters21578.txt.gzbin0 -> 181938 bytes
-rw-r--r--debian/NEWS8
-rw-r--r--debian/README.source58
-rw-r--r--debian/changelog391
-rw-r--r--debian/compat1
-rw-r--r--debian/control60
-rw-r--r--debian/copyright144
-rw-r--r--debian/python-whoosh-doc.doc-base10
-rw-r--r--debian/python-whoosh-doc.docs1
-rw-r--r--debian/python-whoosh-doc.maintscript1
-rwxr-xr-xdebian/rules16
-rw-r--r--debian/source/format1
-rw-r--r--debian/upstream/metadata5
-rw-r--r--debian/watch3
-rw-r--r--docs/source/analysis.rst329
-rw-r--r--docs/source/api/analysis.rst62
-rw-r--r--docs/source/api/api.rst9
-rw-r--r--docs/source/api/codec/base.rst32
-rw-r--r--docs/source/api/collectors.rst47
-rw-r--r--docs/source/api/columns.rst49
-rw-r--r--docs/source/api/fields.rst41
-rw-r--r--docs/source/api/filedb/filestore.rst31
-rw-r--r--docs/source/api/filedb/filetables.rst22
-rw-r--r--docs/source/api/filedb/structfile.rst14
-rw-r--r--docs/source/api/formats.rst24
-rw-r--r--docs/source/api/highlight.rst50
-rw-r--r--docs/source/api/idsets.rst23
-rw-r--r--docs/source/api/index.rst39
-rw-r--r--docs/source/api/lang/morph_en.rst7
-rw-r--r--docs/source/api/lang/porter.rst7
-rw-r--r--docs/source/api/lang/wordnet.rst20
-rw-r--r--docs/source/api/matching.rst34
-rw-r--r--docs/source/api/qparser.rst97
-rw-r--r--docs/source/api/query.rst83
-rw-r--r--docs/source/api/reading.rst22
-rw-r--r--docs/source/api/scoring.rst42
-rw-r--r--docs/source/api/searching.rst33
-rw-r--r--docs/source/api/sorting.rst48
-rw-r--r--docs/source/api/spelling.rst34
-rw-r--r--docs/source/api/support/charset.rst13
-rw-r--r--docs/source/api/support/levenshtein.rst10
-rw-r--r--docs/source/api/util.rst7
-rw-r--r--docs/source/api/writing.rst30
-rw-r--r--docs/source/batch.rst114
-rw-r--r--docs/source/conf.py198
-rw-r--r--docs/source/dates.rst202
-rw-r--r--docs/source/facets.rst771
-rw-r--r--docs/source/fieldcaches.rst52
-rw-r--r--docs/source/glossary.rst65
-rw-r--r--docs/source/highlight.rst419
-rw-r--r--docs/source/index.rst50
-rw-r--r--docs/source/indexing.rst440
-rw-r--r--docs/source/intro.rst60
-rw-r--r--docs/source/keywords.rst94
-rw-r--r--docs/source/nested.rst238
-rw-r--r--docs/source/ngrams.rst51
-rw-r--r--docs/source/parsing.rst437
-rw-r--r--docs/source/query.rst10
-rw-r--r--docs/source/querylang.rst191
-rw-r--r--docs/source/quickstart.rst244
-rw-r--r--docs/source/recipes.rst229
-rw-r--r--docs/source/releases/0_3.rst61
-rw-r--r--docs/source/releases/1_0.rst482
-rw-r--r--docs/source/releases/2_0.rst333
-rw-r--r--docs/source/releases/index.rst11
-rw-r--r--docs/source/schema.rst377
-rw-r--r--docs/source/searching.rst400
-rw-r--r--docs/source/spelling.rst130
-rw-r--r--docs/source/stemming.rst217
-rw-r--r--docs/source/tech/backend.rst175
-rw-r--r--docs/source/tech/filedb.rst29
-rw-r--r--docs/source/tech/index.rst9
-rw-r--r--docs/source/threads.rst74
-rw-r--r--files/whoosh.svg434
-rw-r--r--files/whoosh_16.pngbin0 -> 909 bytes
-rw-r--r--files/whoosh_35.pngbin0 -> 3231 bytes
-rw-r--r--files/whoosh_64.pngbin0 -> 7708 bytes
-rw-r--r--files/whoosh_small.svg604
-rw-r--r--setup.cfg40
-rw-r--r--setup.py60
-rw-r--r--src/Whoosh.egg-info/PKG-INFO88
-rw-r--r--src/Whoosh.egg-info/SOURCES.txt224
-rw-r--r--src/Whoosh.egg-info/dependency_links.txt1
-rw-r--r--src/Whoosh.egg-info/top_level.txt1
-rw-r--r--src/Whoosh.egg-info/zip-safe1
-rw-r--r--src/whoosh/__init__.py49
-rw-r--r--src/whoosh/analysis/__init__.py69
-rw-r--r--src/whoosh/analysis/acore.py156
-rw-r--r--src/whoosh/analysis/analyzers.py296
-rw-r--r--src/whoosh/analysis/filters.py479
-rw-r--r--src/whoosh/analysis/intraword.py494
-rw-r--r--src/whoosh/analysis/morph.py267
-rw-r--r--src/whoosh/analysis/ngrams.py237
-rw-r--r--src/whoosh/analysis/tokenizers.py338
-rw-r--r--src/whoosh/automata/__init__.py0
-rw-r--r--src/whoosh/automata/fsa.py714
-rw-r--r--src/whoosh/automata/glob.py90
-rw-r--r--src/whoosh/automata/lev.py30
-rw-r--r--src/whoosh/automata/nfa.py388
-rw-r--r--src/whoosh/automata/reg.py135
-rwxr-xr-xsrc/whoosh/classify.py377
-rw-r--r--src/whoosh/codec/__init__.py32
-rw-r--r--src/whoosh/codec/base.py843
-rw-r--r--src/whoosh/codec/memory.py334
-rw-r--r--src/whoosh/codec/plaintext.py452
-rw-r--r--src/whoosh/codec/whoosh3.py1281
-rw-r--r--src/whoosh/collectors.py1162
-rw-r--r--src/whoosh/columns.py1411
-rw-r--r--src/whoosh/compat.py206
-rw-r--r--src/whoosh/externalsort.py240
-rw-r--r--src/whoosh/fields.py1603
-rw-r--r--src/whoosh/filedb/__init__.py0
-rw-r--r--src/whoosh/filedb/compound.py331
-rw-r--r--src/whoosh/filedb/filestore.py655
-rw-r--r--src/whoosh/filedb/filetables.py735
-rw-r--r--src/whoosh/filedb/gae.py164
-rw-r--r--src/whoosh/filedb/structfile.py402
-rw-r--r--src/whoosh/formats.py481
-rw-r--r--src/whoosh/highlight.py952
-rw-r--r--src/whoosh/idsets.py703
-rw-r--r--src/whoosh/index.py707
-rw-r--r--src/whoosh/lang/__init__.py140
-rw-r--r--src/whoosh/lang/dmetaphone.py415
-rw-r--r--src/whoosh/lang/isri.py382
-rw-r--r--src/whoosh/lang/lovins.py570
-rw-r--r--src/whoosh/lang/morph_en.py933
-rw-r--r--src/whoosh/lang/paicehusk.py242
-rw-r--r--src/whoosh/lang/phonetic.py119
-rwxr-xr-xsrc/whoosh/lang/porter.py175
-rw-r--r--src/whoosh/lang/porter2.py313
-rw-r--r--src/whoosh/lang/snowball/__init__.py74
-rw-r--r--src/whoosh/lang/snowball/bases.py133
-rw-r--r--src/whoosh/lang/snowball/danish.py115
-rw-r--r--src/whoosh/lang/snowball/dutch.py173
-rw-r--r--src/whoosh/lang/snowball/english.py465
-rw-r--r--src/whoosh/lang/snowball/finnish.py266
-rw-r--r--src/whoosh/lang/snowball/french.py348
-rw-r--r--src/whoosh/lang/snowball/german.py144
-rw-r--r--src/whoosh/lang/snowball/hungarian.py268
-rw-r--r--src/whoosh/lang/snowball/italian.py230
-rw-r--r--src/whoosh/lang/snowball/norwegian.py84
-rw-r--r--src/whoosh/lang/snowball/portugese.py205
-rw-r--r--src/whoosh/lang/snowball/romanian.py253
-rw-r--r--src/whoosh/lang/snowball/russian.py422
-rw-r--r--src/whoosh/lang/snowball/spanish.py248
-rw-r--r--src/whoosh/lang/snowball/swedish.py80
-rw-r--r--src/whoosh/lang/stopwords.py285
-rw-r--r--src/whoosh/lang/wordnet.py242
-rw-r--r--src/whoosh/legacy.py77
-rw-r--r--src/whoosh/matching/__init__.py31
-rw-r--r--src/whoosh/matching/binary.py803
-rw-r--r--src/whoosh/matching/combo.py312
-rw-r--r--src/whoosh/matching/mcore.py622
-rw-r--r--src/whoosh/matching/wrappers.py572
-rw-r--r--src/whoosh/multiproc.py381
-rw-r--r--src/whoosh/qparser/__init__.py30
-rw-r--r--src/whoosh/qparser/common.py65
-rw-r--r--src/whoosh/qparser/dateparse.py922
-rw-r--r--src/whoosh/qparser/default.py439
-rw-r--r--src/whoosh/qparser/plugins.py1413
-rw-r--r--src/whoosh/qparser/syntax.py641
-rw-r--r--src/whoosh/qparser/taggers.py93
-rw-r--r--src/whoosh/query/__init__.py36
-rw-r--r--src/whoosh/query/compound.py660
-rw-r--r--src/whoosh/query/nested.py412
-rw-r--r--src/whoosh/query/positional.py249
-rw-r--r--src/whoosh/query/qcolumns.py117
-rw-r--r--src/whoosh/query/qcore.py715
-rw-r--r--src/whoosh/query/ranges.py347
-rw-r--r--src/whoosh/query/spans.py872
-rw-r--r--src/whoosh/query/terms.py534
-rw-r--r--src/whoosh/query/wrappers.py198
-rw-r--r--src/whoosh/reading.py1295
-rw-r--r--src/whoosh/scoring.py616
-rw-r--r--src/whoosh/searching.py1658
-rw-r--r--src/whoosh/sorting.py1156
-rw-r--r--src/whoosh/spelling.py343
-rw-r--r--src/whoosh/support/__init__.py0
-rw-r--r--src/whoosh/support/base85.py103
-rw-r--r--src/whoosh/support/bench.py610
-rw-r--r--src/whoosh/support/charset.py1379
-rw-r--r--src/whoosh/support/levenshtein.py70
-rw-r--r--src/whoosh/support/relativedelta.py437
-rw-r--r--src/whoosh/support/unicode.py527
-rw-r--r--src/whoosh/system.py79
-rw-r--r--src/whoosh/util/__init__.py142
-rw-r--r--src/whoosh/util/cache.py375
-rw-r--r--src/whoosh/util/filelock.py163
-rw-r--r--src/whoosh/util/loading.py84
-rw-r--r--src/whoosh/util/numeric.py317
-rw-r--r--src/whoosh/util/numlists.py373
-rw-r--r--src/whoosh/util/testing.py130
-rw-r--r--src/whoosh/util/text.py132
-rw-r--r--src/whoosh/util/times.py467
-rw-r--r--src/whoosh/util/varints.py110
-rw-r--r--src/whoosh/util/versions.py165
-rw-r--r--src/whoosh/writing.py1272
-rw-r--r--tests/test_analysis.py532
-rw-r--r--tests/test_automata.py372
-rw-r--r--tests/test_bits.py185
-rw-r--r--tests/test_classify.py132
-rw-r--r--tests/test_codecs.py621
-rw-r--r--tests/test_collector.py229
-rw-r--r--tests/test_columns.py280
-rw-r--r--tests/test_compound.py65
-rw-r--r--tests/test_dateparse.py356
-rw-r--r--tests/test_fields.py597
-rw-r--r--tests/test_flexible.py104
-rw-r--r--tests/test_highlighting.py282
-rw-r--r--tests/test_indexing.py702
-rw-r--r--tests/test_matching.py556
-rw-r--r--tests/test_misc.py161
-rw-r--r--tests/test_mpwriter.py277
-rw-r--r--tests/test_nested.py361
-rw-r--r--tests/test_parse_plugins.py650
-rw-r--r--tests/test_parsing.py996
-rw-r--r--tests/test_postings.py87
-rw-r--r--tests/test_quality.py172
-rw-r--r--tests/test_queries.py574
-rw-r--r--tests/test_reading.py397
-rw-r--r--tests/test_results.py635
-rw-r--r--tests/test_searching.py1737
-rw-r--r--tests/test_sorting.py1053
-rw-r--r--tests/test_spans.py339
-rw-r--r--tests/test_spelling.py353
-rw-r--r--tests/test_tables.py215
-rw-r--r--tests/test_vectors.py103
-rw-r--r--tests/test_weightings.py81
-rw-r--r--tests/test_writing.py430
238 files changed, 70642 insertions, 0 deletions
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..b026632
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,26 @@
+Copyright 2011 Matt Chaput. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are
+those of the authors and should not be interpreted as representing official
+policies, either expressed or implied, of Matt Chaput.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..259e54b
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,8 @@
+include *.txt
+include benchmark/dcvgr10.txt.gz
+include benchmark/reuters21578.txt.gz
+recursive-include tests *.txt *.py
+recursive-include benchmark *.txt *.py
+recursive-include docs *.txt *.py *.rst
+recursive-include files *.txt *.py *.png *.jpg *.svg
+
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..84d0d80
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,88 @@
+Metadata-Version: 1.1
+Name: Whoosh
+Version: 2.7.0
+Summary: Fast, pure-Python full text indexing, search, and spell checking library.
+Home-page: http://bitbucket.org/mchaput/whoosh
+Author: Matt Chaput
+Author-email: matt@whoosh.ca
+License: Two-clause BSD license
+Description: About Whoosh
+ ============
+
+ Whoosh is a fast, featureful full-text indexing and searching library
+ implemented in pure Python. Programmers can use it to easily add search
+ functionality to their applications and websites. Every part of how Whoosh
+ works can be extended or replaced to meet your needs exactly.
+
+ Some of Whoosh's features include:
+
+ * Pythonic API.
+ * Pure-Python. No compilation or binary packages needed, no mysterious crashes.
+ * Fielded indexing and search.
+ * Fast indexing and retrieval -- faster than any other pure-Python, scoring,
+ full-text search solution I know of.
+ * Pluggable scoring algorithm (including BM25F), text analysis, storage,
+ posting format, etc.
+ * Powerful query language.
+ * Pure Python spell-checker (as far as I know, the only one).
+
+ Whoosh might be useful in the following circumstances:
+
+ * Anywhere a pure-Python solution is desirable to avoid having to build/compile
+ native libraries (or force users to build/compile them).
+ * As a research platform (at least for programmers that find Python easier to
+ read and work with than Java ;)
+ * When an easy-to-use Pythonic interface is more important to you than raw
+ speed.
+
+ Whoosh was created and is maintained by Matt Chaput. It was originally created
+ for use in the online help system of Side Effects Software's 3D animation
+ software Houdini. Side Effects Software Inc. graciously agreed to open-source
+ the code.
+
+ This software is licensed under the terms of the simplified BSD (A.K.A. "two
+ clause" or "FreeBSD") license. See LICENSE.txt for information.
+
+ Installing Whoosh
+ =================
+
+ If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install``
+ or ``pip`` to download and install Whoosh automatically::
+
+ $ easy_install Whoosh
+
+ or
+
+ $ pip install Whoosh
+
+ Learning more
+ =============
+
+ * Read the online documentation at http://packages.python.org/Whoosh/
+
+ * Join the Whoosh mailing list at http://groups.google.com/group/whoosh
+
+ * File bug reports and view the Whoosh wiki at
+ http://bitbucket.org/mchaput/whoosh/
+
+ Getting the source
+ ==================
+
+ Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/
+
+ You can check out the latest version of the source code using Mercurial::
+
+ hg clone http://bitbucket.org/mchaput/whoosh
+
+
+Keywords: index search text spell
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 2.5
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Indexing
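
The feature list above mentions a pluggable scoring algorithm with BM25F as the default. A minimal sketch of what swapping the ranking model looks like, assuming an index already exists in an "indexdir" directory with a stored "title" field (the directory and field names are placeholders; benchmark/marc21.py below uses the same weighting= keyword with scoring.PL2())::

    from whoosh import index, scoring
    from whoosh.qparser import QueryParser

    ix = index.open_dir("indexdir")            # open an existing index
    query = QueryParser("content", ix.schema).parse(u"full text search")

    # BM25F is the default ranking model
    with ix.searcher() as s:
        print([hit["title"] for hit in s.search(query, limit=5)])

    # The same query ranked with PL2 instead
    with ix.searcher(weighting=scoring.PL2()) as s:
        print([hit["title"] for hit in s.search(query, limit=5)])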
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..94be5ec
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,68 @@
+About Whoosh
+============
+
+Whoosh is a fast, featureful full-text indexing and searching library
+implemented in pure Python. Programmers can use it to easily add search
+functionality to their applications and websites. Every part of how Whoosh
+works can be extended or replaced to meet your needs exactly.
+
+Some of Whoosh's features include:
+
+* Pythonic API.
+* Pure-Python. No compilation or binary packages needed, no mysterious crashes.
+* Fielded indexing and search.
+* Fast indexing and retrieval -- faster than any other pure-Python, scoring,
+ full-text search solution I know of.
+* Pluggable scoring algorithm (including BM25F), text analysis, storage,
+ posting format, etc.
+* Powerful query language.
+* Pure Python spell-checker (as far as I know, the only one).
+
+Whoosh might be useful in the following circumstances:
+
+* Anywhere a pure-Python solution is desirable to avoid having to build/compile
+ native libraries (or force users to build/compile them).
+* As a research platform (at least for programmers that find Python easier to
+ read and work with than Java ;)
+* When an easy-to-use Pythonic interface is more important to you than raw
+ speed.
+
+Whoosh was created and is maintained by Matt Chaput. It was originally created
+for use in the online help system of Side Effects Software's 3D animation
+software Houdini. Side Effects Software Inc. graciously agreed to open-source
+the code.
+
+This software is licensed under the terms of the simplified BSD (A.K.A. "two
+clause" or "FreeBSD") license. See LICENSE.txt for information.
+
+Installing Whoosh
+=================
+
+If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install``
+or ``pip`` to download and install Whoosh automatically::
+
+ $ easy_install Whoosh
+
+ or
+
+ $ pip install Whoosh
+
+Learning more
+=============
+
+* Read the online documentation at http://packages.python.org/Whoosh/
+
+* Join the Whoosh mailing list at http://groups.google.com/group/whoosh
+
+* File bug reports and view the Whoosh wiki at
+ http://bitbucket.org/mchaput/whoosh/
+
+Getting the source
+==================
+
+Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/
+
+You can check out the latest version of the source code using Mercurial::
+
+ hg clone http://bitbucket.org/mchaput/whoosh
+
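The README describes adding search functionality to applications but does not show the basic workflow. A minimal sketch following the quickstart pattern documented in docs/source/quickstart.rst (field names and sample text here are illustrative)::

    import os
    from whoosh.index import create_in
    from whoosh.fields import Schema, TEXT, ID
    from whoosh.qparser import QueryParser

    # Define which fields each document has and how they are indexed
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

    os.mkdir("indexdir")               # create_in() expects an existing directory
    ix = create_in("indexdir", schema)

    # Add a couple of documents
    writer = ix.writer()
    writer.add_document(title=u"First document", path=u"/a",
                        content=u"This is the first document we've added!")
    writer.add_document(title=u"Second document", path=u"/b",
                        content=u"The second one is even more interesting!")
    writer.commit()

    # Parse a query against the "content" field and print matching titles
    with ix.searcher() as searcher:
        results = searcher.search(QueryParser("content", ix.schema).parse(u"first"))
        print([hit["title"] for hit in results])   # ['First document']
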
diff --git a/benchmark/dcvgr10.txt.gz b/benchmark/dcvgr10.txt.gz
new file mode 100644
index 0000000..e0e2877
--- /dev/null
+++ b/benchmark/dcvgr10.txt.gz
Binary files differ
diff --git a/benchmark/dictionary.py b/benchmark/dictionary.py
new file mode 100644
index 0000000..77feb48
--- /dev/null
+++ b/benchmark/dictionary.py
@@ -0,0 +1,43 @@
+import os.path, gzip
+
+from whoosh import analysis, fields
+from whoosh.support.bench import Bench, Spec
+
+
+class VulgarTongue(Spec):
+ name = "dictionary"
+ filename = "dcvgr10.txt.gz"
+ headline_field = "head"
+
+ def documents(self):
+ path = os.path.join(self.options.dir, self.filename)
+ f = gzip.GzipFile(path)
+
+ head = body = None
+ for line in f:
+ line = line.decode("latin1")
+ if line[0].isalpha():
+ if head:
+ yield {"head": head, "body": head + body}
+ head, body = line.split(".", 1)
+ else:
+ body += line
+
+ if head:
+ yield {"head": head, "body": head + body}
+
+ def whoosh_schema(self):
+ ana = analysis.StemmingAnalyzer()
+ #ana = analysis.StandardAnalyzer()
+ schema = fields.Schema(head=fields.ID(stored=True),
+ body=fields.TEXT(analyzer=ana, stored=True))
+ return schema
+
+ def zcatalog_setup(self, cat):
+ from zcatalog import indexes #@UnresolvedImport
+ cat["head"] = indexes.FieldIndex(field_name="head")
+ cat["body"] = indexes.TextIndex(field_name="body")
+
+
+if __name__ == "__main__":
+ Bench().run(VulgarTongue)
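
whoosh_schema() above picks analysis.StemmingAnalyzer() over the commented-out StandardAnalyzer(). A small sketch of the difference between the two (the sample sentence is illustrative, and the exact tokens depend on the stop-word list and stemmer)::

    from whoosh import analysis

    text = u"Dictionaries of definitions are indexed and searched quickly"

    # Both analyzers are callable and yield Token objects; StandardAnalyzer
    # lowercases and drops stop words, StemmingAnalyzer additionally stems.
    print([t.text for t in analysis.StandardAnalyzer()(text)])
    # roughly: ['dictionaries', 'definitions', 'indexed', 'searched', 'quickly']
    print([t.text for t in analysis.StemmingAnalyzer()(text)])
    # roughly: ['dictionari', 'definit', 'index', 'search', 'quickli']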
diff --git a/benchmark/enron.py b/benchmark/enron.py
new file mode 100644
index 0000000..80650c3
--- /dev/null
+++ b/benchmark/enron.py
@@ -0,0 +1,185 @@
+from __future__ import division
+import os.path, tarfile
+from email import message_from_string
+from marshal import dump, load
+from zlib import compress, decompress
+
+try:
+ import xappy
+except ImportError:
+ pass
+
+from whoosh import analysis, fields
+from whoosh.compat import urlretrieve, next
+from whoosh.support.bench import Bench, Spec
+from whoosh.util import now
+
+
+# Benchmark class
+
+class Enron(Spec):
+ name = "enron"
+
+ enron_archive_url = "http://www.cs.cmu.edu/~enron/enron_mail_082109.tar.gz"
+ enron_archive_filename = "enron_mail_082109.tar.gz"
+ cache_filename = "enron_cache.pickle"
+
+ header_to_field = {"Date": "date", "From": "frm", "To": "to",
+ "Subject": "subject", "Cc": "cc", "Bcc": "bcc"}
+
+ main_field = "body"
+ headline_field = "subject"
+
+ field_order = ("subject", "date", "from", "to", "cc", "bcc", "body")
+
+ cachefile = None
+
+ # Functions for downloading and then reading the email archive and caching
+ # the messages in an easier-to-digest format
+
+ def download_archive(self, archive):
+ print("Downloading Enron email archive to %r..." % archive)
+ t = now()
+ urlretrieve(self.enron_archive_url, archive)
+ print("Downloaded in ", now() - t, "seconds")
+
+ @staticmethod
+ def get_texts(archive):
+ archive = tarfile.open(archive, "r:gz")
+ while True:
+ entry = next(archive)
+ archive.members = []
+ if entry is None:
+ break
+ f = archive.extractfile(entry)
+ if f is not None:
+ text = f.read()
+ yield text
+
+ @staticmethod
+ def get_messages(archive, headers=True):
+ header_to_field = Enron.header_to_field
+ for text in Enron.get_texts(archive):
+ message = message_from_string(text)
+ body = message.as_string().decode("latin_1")
+ blank = body.find("\n\n")
+ if blank > -1:
+ body = body[blank+2:]
+ d = {"body": body}
+ if headers:
+ for k in message.keys():
+ fn = header_to_field.get(k)
+ if not fn: continue
+ v = message.get(k).strip()
+ if v:
+ d[fn] = v.decode("latin_1")
+ yield d
+
+ def cache_messages(self, archive, cache):
+ print("Caching messages in %s..." % cache)
+
+ if not os.path.exists(archive):
+ raise Exception("Archive file %r does not exist" % archive)
+
+ t = now()
+ f = open(cache, "wb")
+ c = 0
+ for d in self.get_messages(archive):
+ c += 1
+ dump(d, f)
+ if not c % 1000: print(c)
+ f.close()
+ print("Cached messages in ", now() - t, "seconds")
+
+ def setup(self):
+ archive = os.path.abspath(os.path.join(self.options.dir, self.enron_archive_filename))
+ cache = os.path.abspath(os.path.join(self.options.dir, self.cache_filename))
+
+ if not os.path.exists(archive):
+ self.download_archive(archive)
+ else:
+ print("Archive is OK")
+
+ if not os.path.exists(cache):
+ self.cache_messages(archive, cache)
+ else:
+ print("Cache is OK")
+
+ def documents(self):
+ if not os.path.exists(self.cache_filename):
+ raise Exception("Message cache does not exist, use --setup")
+
+ f = open(self.cache_filename, "rb")
+ try:
+ while True:
+ self.filepos = f.tell()
+ d = load(f)
+ yield d
+ except EOFError:
+ pass
+ f.close()
+
+ def whoosh_schema(self):
+ ana = analysis.StemmingAnalyzer(maxsize=40, cachesize=None)
+ storebody = self.options.storebody
+ schema = fields.Schema(body=fields.TEXT(analyzer=ana, stored=storebody),
+ filepos=fields.STORED,
+ date=fields.ID(stored=True),
+ frm=fields.ID(stored=True),
+ to=fields.IDLIST(stored=True),
+ subject=fields.TEXT(stored=True),
+ cc=fields.IDLIST,
+ bcc=fields.IDLIST)
+ return schema
+
+ def xappy_indexer_connection(self, path):
+ conn = xappy.IndexerConnection(path)
+ conn.add_field_action('body', xappy.FieldActions.INDEX_FREETEXT, language='en')
+ if self.options.storebody:
+ conn.add_field_action('body', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('date', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('date', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('frm', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('frm', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('to', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('to', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('subject', xappy.FieldActions.INDEX_FREETEXT, language='en')
+ conn.add_field_action('subject', xappy.FieldActions.STORE_CONTENT)
+ conn.add_field_action('cc', xappy.FieldActions.INDEX_EXACT)
+ conn.add_field_action('bcc', xappy.FieldActions.INDEX_EXACT)
+ return conn
+
+ def zcatalog_setup(self, cat):
+ from zcatalog import indexes
+ for name in ("date", "frm"):
+ cat[name] = indexes.FieldIndex(field_name=name)
+ for name in ("to", "subject", "cc", "bcc", "body"):
+ cat[name] = indexes.TextIndex(field_name=name)
+
+ def process_document_whoosh(self, d):
+ d["filepos"] = self.filepos
+ if self.options.storebody:
+ mf = self.main_field
+ d["_stored_%s" % mf] = compress(d[mf], 9)
+
+ def process_result_whoosh(self, d):
+ mf = self.main_field
+ if mf in d:
+ d.fields()[mf] = decompress(d[mf])
+ else:
+ if not self.cachefile:
+ self.cachefile = open(self.cache_filename, "rb")
+ filepos = d["filepos"]
+ self.cachefile.seek(filepos)
+ dd = load(self.cachefile)
+ d.fields()[mf] = dd[mf]
+ return d
+
+ def process_document_xapian(self, d):
+ d[self.main_field] = " ".join([d.get(name, "") for name
+ in self.field_order])
+
+
+
+if __name__=="__main__":
+ Bench().run(Enron)
diff --git a/benchmark/marc21.py b/benchmark/marc21.py
new file mode 100644
index 0000000..9a2bb9b
--- /dev/null
+++ b/benchmark/marc21.py
@@ -0,0 +1,297 @@
+from __future__ import with_statement, print_function
+import fnmatch, logging, os.path, re
+
+from whoosh import analysis, fields, index, qparser, query, scoring
+from whoosh.compat import xrange
+from whoosh.util import now
+
+
+log = logging.getLogger(__name__)
+
+
+# Functions for reading MARC format
+
+LEADER = (' ' * 10) + '22' + (' ' * 8) + '4500'
+LEADER_LEN = len(LEADER)
+DIRECTORY_ENTRY_LEN = 12
+SUBFIELD_INDICATOR = "\x1F"
+END_OF_FIELD = "\x1E"
+END_OF_RECORD = "\x1D"
+isbn_regex = re.compile(r'[-0-9xX]+')
+
+
+def read_file(dbfile, tags=None):
+ while True:
+ pos = dbfile.tell()
+ first5 = dbfile.read(5)
+ if not first5:
+ return
+ if len(first5) < 5:
+ raise Exception
+ length = int(first5)
+ chunk = dbfile.read(length - 5)
+ yield parse_record(first5 + chunk, tags), pos
+
+
+def read_record(filename, pos, tags=None):
+ f = open(filename, "rb")
+ f.seek(pos)
+ first5 = f.read(5)
+ length = int(first5)
+ chunk = f.read(length - 5)
+ return parse_record(first5 + chunk, tags)
+
+
+def parse_record(data, tags=None):
+ leader = data[:LEADER_LEN]
+ assert len(leader) == LEADER_LEN
+
+ dataoffset = int(data[12:17])
+ assert dataoffset > 0
+ assert dataoffset < len(data)
+
+ # dataoffset - 1 to avoid END-OF-FIELD byte
+ dirstart = LEADER_LEN
+ dirend = dataoffset - 1
+
+ # Number of fields in record
+ assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
+ field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN
+
+ result = {}
+ for i in xrange(field_count):
+ start = dirstart + i * DIRECTORY_ENTRY_LEN
+ end = start + DIRECTORY_ENTRY_LEN
+ tag = data[start:start + 3]
+ if tags and not tag in tags:
+ continue
+
+ entry = data[start:end]
+ elen = int(entry[3:7])
+ offset = dataoffset + int(entry[7:12])
+ edata = data[offset:offset + elen - 1]
+
+ if not (tag < "010" and tag.isdigit()):
+ edata = edata.split(SUBFIELD_INDICATOR)[1:]
+ if tag in result:
+ result[tag].extend(edata)
+ else:
+ result[tag] = edata
+ else:
+ result[tag] = edata
+ return result
+
+
+def subfield(vs, code):
+ for v in vs:
+ if v.startswith(code):
+ return v[1:]
+ return None
+
+
+def joinsubfields(vs):
+ return " ".join(v[1:] for v in vs if v and v[0] != "6")
+
+
+def getfields(d, *tags):
+ return (d[tag] for tag in tags if tag in d)
+
+
+def title(d):
+ title = None
+ if "245" in d:
+ svs = d["245"]
+ title = subfield(svs, "a")
+ if title:
+ t2 = subfield(svs, "b")
+ if t2:
+ title += t2
+ return title
+
+
+def isbn(d):
+ if "020" in d:
+ num = subfield(d["020"], "a")
+ if num:
+ match = isbn_regex.search(num)
+ if match:
+ return match.group(0).replace('-', '')
+
+
+def author(d):
+ if "100" in d:
+ return joinsubfields(d["100"])
+ elif "110" in d:
+ return joinsubfields(d["110"])
+ elif "111" in d:
+ return joinsubfields(d["111"])
+
+
+def uniform_title(d):
+ if "130" in d:
+ return joinsubfields(d["130"])
+ elif "240" in d:
+ return joinsubfields(d["240"])
+
+
+subjectfields = ("600 610 611 630 648 650 651 653 654 655 656 657 658 662 "
+ "690 691 696 697 698 699").split()
+
+
+def subjects(d):
+ return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields))
+
+
+def physical(d):
+ return joinsubfields(d["300"])
+
+
+def location(d):
+ return joinsubfields(d["852"])
+
+
+def publisher(d):
+ if "260" in d:
+ return subfield(d["260"], "b")
+
+
+def pubyear(d):
+ if "260" in d:
+ return subfield(d["260"], "c")
+
+
+def uni(v):
+ return u"" if v is None else v.decode("utf-8", "replace")
+
+
+# Indexing and searching
+
+def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
+ glob="*.mrc"):
+ if not os.path.exists(ixdir):
+ os.mkdir(ixdir)
+
+ # Multi-lingual stop words
+ stoplist = (analysis.STOP_WORDS
+ | set("de la der und le die et en al no von di du da "
+ "del zur ein".split()))
+ # Schema
+ ana = analysis.StemmingAnalyzer(stoplist=stoplist)
+ schema = fields.Schema(title=fields.TEXT(analyzer=ana),
+ author=fields.TEXT(phrase=False),
+ subject=fields.TEXT(analyzer=ana, phrase=False),
+ file=fields.STORED, pos=fields.STORED,
+ )
+
+ # MARC fields to extract
+ mfields = set(subjectfields) # Subjects
+ mfields.update("100 110 111".split()) # Author
+ mfields.add("245") # Title
+
+ print("Indexing with %d processor(s) and %d MB per processor"
+ % (procs, limitmb))
+ c = 0
+ t = now()
+ ix = index.create_in(ixdir, schema)
+ with ix.writer(procs=procs, limitmb=limitmb,
+ multisegment=multisegment) as w:
+ filenames = [filename for filename in os.listdir(basedir)
+ if fnmatch.fnmatch(filename, glob)]
+ for filename in filenames:
+ path = os.path.join(basedir, filename)
+ print("Indexing", path)
+ f = open(path, 'rb')
+ for x, pos in read_file(f, mfields):
+ w.add_document(title=uni(title(x)), author=uni(author(x)),
+ subject=uni(subjects(x)),
+ file=filename, pos=pos)
+ c += 1
+ f.close()
+ print("Committing...")
+ print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))
+
+
+def print_record(no, basedir, filename, pos):
+ path = os.path.join(basedir, filename)
+ record = read_record(path, pos)
+ print("% 5d. %s" % (no + 1, title(record)))
+ print(" ", author(record))
+ print(" ", subjects(record))
+ isbn_num = isbn(record)
+ if isbn_num:
+ print(" ISBN:", isbn_num)
+ print()
+
+
+def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
+ ix = index.open_dir(ixdir)
+ qp = qparser.QueryParser("title", ix.schema)
+ q = qp.parse(qstring)
+
+ with ix.searcher(weighting=scoring.PL2()) as s:
+ if scores:
+ r = s.search(q, limit=limit, optimize=optimize)
+ for hit in r:
+ print_record(hit.rank, basedir, hit["file"], hit["pos"])
+ print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
+ else:
+ t = now()
+ for i, docnum in enumerate(s.docs_for_query(q)):
+ if not limit or i < limit:
+ fields = s.stored_fields(docnum)
+ print_record(i, basedir, fields["file"], fields["pos"])
+ print("Found %d records in %0.06f seconds" % (i, now() - t))
+
+
+if __name__ == "__main__":
+ from optparse import OptionParser
+
+ p = OptionParser(usage="usage: %prog [options] query")
+ # Common options
+ p.add_option("-f", "--filedir", metavar="DIR", dest="basedir",
+ help="Directory containing the .mrc files to index",
+ default="data/HLOM")
+ p.add_option("-d", "--dir", metavar="DIR", dest="ixdir",
+ help="Directory containing the index", default="marc_index")
+
+ # Indexing options
+ p.add_option("-i", "--index", dest="index",
+ help="Index the records", action="store_true", default=False)
+ p.add_option("-p", "--procs", metavar="NPROCS", dest="procs",
+ help="Number of processors to use", default="1")
+ p.add_option("-m", "--mb", metavar="MB", dest="limitmb",
+ help="Limit the indexer to this many MB of memory per writer",
+ default="128")
+ p.add_option("-M", "--merge-segments", dest="multisegment",
+ help="If indexing with multiproc, merge the segments after"
+ " indexing", action="store_false", default=True)
+ p.add_option("-g", "--match", metavar="GLOB", dest="glob",
+ help="Only index file names matching the given pattern",
+ default="*.mrc")
+
+ # Search options
+ p.add_option("-l", "--limit", metavar="NHITS", dest="limit",
+ help="Maximum number of search results to print (0=no limit)",
+ default="10")
+ p.add_option("-O", "--no-optimize", dest="optimize",
+ help="Turn off searcher optimization (for debugging)",
+ action="store_false", default=True)
+ p.add_option("-s", "--scoring", dest="scores",
+ help="Score the results", action="store_true", default=False)
+
+ options, args = p.parse_args()
+
+ if options.index:
+ make_index(options.basedir, options.ixdir,
+ procs=int(options.procs),
+ limitmb=int(options.limitmb),
+ multisegment=options.multisegment,
+ glob=options.glob)
+
+ if args:
+ qstring = " ".join(args).decode("utf-8")
+ limit = int(options.limit)
+ if limit < 1:
+ limit = None
+ search(qstring, options.ixdir, options.basedir, limit=limit,
+ optimize=options.optimize, scores=options.scores)
diff --git a/benchmark/reuters.py b/benchmark/reuters.py
new file mode 100644
index 0000000..aa20c74
--- /dev/null
+++ b/benchmark/reuters.py
@@ -0,0 +1,38 @@
+import gzip, os.path
+
+from whoosh import analysis, fields, index, qparser, query
+from whoosh.support.bench import Bench, Spec
+from whoosh.util import now
+
+
+class Reuters(Spec):
+ name = "reuters"
+ filename = "reuters21578.txt.gz"
+ main_field = "text"
+ headline_text = "headline"
+
+ def whoosh_schema(self):
+ #ana = analysis.StemmingAnalyzer()
+ ana = analysis.StandardAnalyzer()
+ schema = fields.Schema(id=fields.ID(stored=True),
+ headline=fields.STORED,
+ text=fields.TEXT(analyzer=ana, stored=True))
+ return schema
+
+ def zcatalog_setup(self, cat):
+ from zcatalog import indexes #@UnresolvedImport
+ cat["id"] = indexes.FieldIndex(field_name="id")
+ cat["headline"] = indexes.TextIndex(field_name="headline")
+ cat["body"] = indexes.TextIndex(field_name="text")
+
+ def documents(self):
+ path = os.path.join(self.options.dir, self.filename)
+ f = gzip.GzipFile(path)
+
+ for line in f:
+ id, text = line.decode("latin1").split("\t")
+ yield {"id": id, "text": text, "headline": text[:70]}
+
+
+if __name__ == "__main__":
+ Bench().run(Reuters)
diff --git a/benchmark/reuters21578.txt.gz b/benchmark/reuters21578.txt.gz
new file mode 100644
index 0000000..cdf0677
--- /dev/null
+++ b/benchmark/reuters21578.txt.gz
Binary files differ
diff --git a/debian/NEWS b/debian/NEWS
new file mode 100644
index 0000000..cb020ae
--- /dev/null
+++ b/debian/NEWS
@@ -0,0 +1,8 @@
+python-whoosh (0.1.22-1) unstable; urgency=low
+
+ When upgrading from earlier python-whoosh versions, you will have to
+ reindex all data, as the index format has changed. How this has to
+ happen is application-specific.
+
+ -- Daniel Watkins <daniel.watkins@credativ.co.uk> Sat, 06 Jun 2009 13:35:15 +0100
+
diff --git a/debian/README.source b/debian/README.source
new file mode 100644
index 0000000..5dde0bf
--- /dev/null
+++ b/debian/README.source
@@ -0,0 +1,58 @@
+This package uses quilt to manage all modifications to the upstream
+source. Changes are stored in the source package as diffs in
+debian/patches and applied during the build.
+
+To configure quilt to use debian/patches instead of patches, you want
+either to export QUILT_PATCHES=debian/patches in your environment
+or use this snippet in your ~/.quiltrc:
+
+ for where in ./ ../ ../../ ../../../ ../../../../ ../../../../../; do
+ if [ -e ${where}debian/rules -a -d ${where}debian/patches ]; then
+ export QUILT_PATCHES=debian/patches
+ break
+ fi
+ done
+
+To get the fully patched source after unpacking the source package, cd to
+the root level of the source package and run:
+
+ quilt push -a
+
+The last patch listed in debian/patches/series will become the current
+patch.
+
+To add a new set of changes, first run quilt push -a, and then run:
+
+ quilt new <patch>
+
+where <patch> is a descriptive name for the patch, used as the filename in
+debian/patches. Then, for every file that will be modified by this patch,
+run:
+
+ quilt add <file>
+
+before editing those files. You must tell quilt with quilt add what files
+will be part of the patch before making changes or quilt will not work
+properly. After editing the files, run:
+
+ quilt refresh
+
+to save the results as a patch.
+
+Alternately, if you already have an external patch and you just want to
+add it to the build system, run quilt push -a and then:
+
+ quilt import -P <patch> /path/to/patch
+ quilt push -a
+
+(add -p 0 to quilt import if needed). <patch> as above is the filename to
+use in debian/patches. The last quilt push -a will apply the patch to
+make sure it works properly.
+
+To remove an existing patch from the list of patches that will be applied,
+run:
+
+ quilt delete <patch>
+
+You may need to run quilt pop -a to unapply patches first before running
+this command.
diff --git a/debian/changelog b/debian/changelog
new file mode 100644
index 0000000..9b44d70
--- /dev/null
+++ b/debian/changelog
@@ -0,0 +1,391 @@
+python-whoosh (2.7.0-1) unstable; urgency=medium
+
+ * New upstream release.
+ * Update watch file.
+ Thanks to Piotr Ożarowski
+ * debian/copyright: Update copyright years.
+ * debian/upstream/metadata: Added upstream metadata.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net> Thu, 07 May 2015 13:22:20 +0200
+
+python-whoosh (2.5.7-3) unstable; urgency=medium
+
+ [ أحمد المحمودي (Ahmed El-Mahmoudy) ]
+ * Update my email address.
+ * debian/control: Bumped Standards-Version to 3.9.6
+
+ [ Jean-Michel Nirgal Vourgère ]
+ * Change python-whoosh.maintscript into python-whoosh-doc.maintscript
+ /usr/share/doc/python-whoosh-doc was a link to python-whoosh. Fixed
+ 'prior-version' as the current one, see dpkg-maintscript-helper(1). Drop
+    optional 'package' since we are not using maintscript but
+ python-whoosh-doc.maintscript. Drop unused Pre-Depends on dpkg with support
+ of symlink_to_dir, added missing Pre-Depends on misc:Pre-Depends in
+ python-whoosh-doc (Closes: #768275)
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net> Sun, 16 Nov 2014 12:16:23 +0200
+
+python-whoosh (2.5.7-2) unstable; urgency=medium
+
+ [ Zygmunt Krynicki ]
+ * debian/rules: convert to pybuild, simplify all rules
+ * debian/control: add support for python3, depend on dh-python
+ (Closes: #647439)
+ * debian/python-whoosh.install: remove (not needed anymore)
+ * debian/control: build-depend on python3-sphinx for documentation
+
+ [ أحمد المحمودي (Ahmed El-Mahmoudy) ]
+ * Moved packaging to collab-maint.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Tue, 22 Jul 2014 11:53:36 +0200
+
+python-whoosh (2.5.7-1) unstable; urgency=low
+
+ * New upstream release.
+ * debian/copyright: Update copyright years.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 22 Feb 2014 10:06:33 +0200
+
+python-whoosh (2.5.6-3) unstable; urgency=low
+
+ * Added debian/python-whoosh.maintscript to switch
+ /usr/share/doc/python-whoosh symlink to a real directory
+ (Closes: #736299)
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 25 Jan 2014 11:53:46 +0200
+
+python-whoosh (2.5.6-2) unstable; urgency=low
+
+ * Remove override for dh_fixperms, seems to be no more needed.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Mon, 23 Dec 2013 22:51:23 +0200
+
+python-whoosh (2.5.6-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 21 Dec 2013 16:05:11 +0200
+
+python-whoosh (2.5.5-1) unstable; urgency=low
+
+ * New upstream release.
+ * debian/control: Bumped Standards-Version to 3.9.5
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Thu, 14 Nov 2013 13:36:18 +0200
+
+python-whoosh (2.5.4-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Mon, 23 Sep 2013 21:44:53 +0200
+
+python-whoosh (2.5.3-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 13 Sep 2013 04:39:56 +0200
+
+python-whoosh (2.5.2-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 17 Aug 2013 18:20:03 +0200
+
+python-whoosh (2.5.1-1) unstable; urgency=low
+
+ * New upstream release.
+ * debian/control:
+ + Remove Daniel Watkins from Uploaders field, since he seems to be MIA
+ (Closes: #705280)
+ + Bumped Standards-Version to 3.9.4
+ + Use canonical URIs in Vcs-* fields
+ + Remove obsolete DMUA fields
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 05 Jul 2013 00:19:56 +0200
+
+python-whoosh (2.4.1-1) unstable; urgency=low
+
+ * New upstream release.
+  * Removed test_final_ranges_thisyear.diff: fixed upstream.
+ * debian/control: Updated Standards-Version to 3.9.3
+ * Bumped compat level to 9
+ * debian/copyright: Updated copyright format & years.
+ * Un-link python-whoosh-doc documentation directory from python-whoosh
+ documentation directory:
+ + debian/rules: remove override for dh_installdocs
+ + Update python-whoosh-doc.doc-base
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Thu, 26 Jul 2012 13:45:44 +0200
+
+python-whoosh (2.3.2-2) unstable; urgency=low
+
+ * Added test_final_ranges_thisyear.diff to fix the "oct 2010 to feb" date
+ range test (Closes: #655641)
+ * debian/control: Updated upstream URL
+ * debian/copyright:
+ + Updated copyright years
+ + Updated upstream URL
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 13 Jan 2012 11:32:18 +0200
+
+python-whoosh (2.3.2-1) unstable; urgency=low
+
+ * New upstream release.
+ * Build-Dep on python-sphinx (>= 1.0.7+dfsg)
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 16 Dec 2011 15:54:19 +0200
+
+python-whoosh (2.3.0-1) unstable; urgency=low
+
+ * New upstream release.
+ * Updated copyright format & info
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 08 Oct 2011 16:39:48 +0200
+
+python-whoosh (2.2.2-1) unstable; urgency=low
+
+ * New upstream release. (Closes: #638765)
+ * Moved packaging to Git.
+ + debian/control: Remove DPMT from package maintainership due to the move
+ to Git. Put myself as maintainer instead
+ * Split documentation into python-whoosh-doc package
+ + debian/control:
+ - Added python-whoosh-doc package
+ - Added Suggests: python-whoosh-doc for python-whoosh
+ + debian/rules: Symlink python-whoosh-doc documentation directory to
+ python-whoosh documentation directory.
+ + Renamed debian/python-whoosh.{docs,doc-base} to
+ debian/python-whoosh-doc.{docs,doc-base}
+ + Added debian/python-whoosh.install
+ * debian/control: XS-Python-Version -> X-Python-Version
+ * Use sphinxdoc debhelper instead of managing symlinks myself
+ + debian/control: Replace libjs-jquery with ${sphinxdoc:Depends}
+ + debian/rules: Add sphinxdoc debhelper sequence to dh call
+ + Removed debian/python-whoosh.links
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 26 Aug 2011 10:26:44 +0200
+
+python-whoosh (1.8.4-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 04 Jun 2011 19:44:07 +0200
+
+python-whoosh (1.8.2-1) unstable; urgency=low
+
+ * New upstream release.
+ * debian/control: Bumped Standards-Version to 3.9.2
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Tue, 19 Apr 2011 08:59:07 +0200
+
+python-whoosh (1.8.1-1) unstable; urgency=low
+
+ * New upstream release
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Mon, 04 Apr 2011 20:21:30 +0200
+
+python-whoosh (1.8.0-1) unstable; urgency=low
+
+ * New upstream release
+ * Remove fix_test_combine.diff patch, as it is applied upstream.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Thu, 24 Mar 2011 10:50:58 +0200
+
+python-whoosh (1.7.8-1) unstable; urgency=low
+
+ * New upstream release.
+ * Removed use_nose.diff & shm_check.diff as they are applied upstream.
+ * Added fix_test_combine.diff patch from upstream to fix test_combine test.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 19 Mar 2011 10:15:14 +0200
+
+python-whoosh (1.7.6-1) unstable; urgency=low
+
+ * New upstream release.
+ * debian/control: added python-nose to B-D-I
+ * Added use_nose.diff patch from upstream to switch setuptools "test_suite"
+ key to use Nose integration.
+ * Refreshed shm_check.diff patch.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 26 Feb 2011 23:57:15 +0200
+
+python-whoosh (1.7.4-1) unstable; urgency=low
+
+ * New upstream release.
+ * Removed fix_methodcaller_import.diff and fix_test_colonspace.diff patches,
+ as they are applied upstream.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Tue, 22 Feb 2011 21:30:12 +0200
+
+python-whoosh (1.7.2-1) unstable; urgency=low
+
+ * New upstream release.
+ * Added fix_methodcaller_import.diff patch which fixes import of
+ methodcaller, which was only added in Python 2.6.
+ * Added fix_test_colonspace.diff patch which fixes test_colonspace failure.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 18 Feb 2011 10:02:16 +0200
+
+python-whoosh (1.4.1-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 10 Dec 2010 10:54:37 +0200
+
+python-whoosh (1.2.6-2) unstable; urgency=low
+
+ * debian/patches/shm_check.diff: try importing multiprocessing.synchronize
+ to check for ImportError, this is to avoid FTBFS against python 2.7
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Thu, 11 Nov 2010 12:46:49 +0200
+
+python-whoosh (1.2.6-1) unstable; urgency=low
+
+ * New upstream release
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 05 Nov 2010 09:53:55 +0200
+
+python-whoosh (1.2.5-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Sat, 30 Oct 2010 05:54:31 +0200
+
+python-whoosh (1.2.3-1) experimental; urgency=low
+
+ * New upstream release.
+ * Refreshed shm_check.diff patch.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 29 Oct 2010 08:07:24 +0200
+
+python-whoosh (1.1.0-1) experimental; urgency=low
+
+ * New upstream release
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Fri, 22 Oct 2010 14:30:44 +0200
+
+python-whoosh (1.0.0-1) experimental; urgency=low
+
+ [ Jakub Wilk ]
+ * Use ‘chmod a-x’ rather than ‘chmod -x’ in debian/rules.
+ * Don't ignore errors while running tests.
+ * Respect the ‘nocheck’ build option.
+ * Remove embedded copies of pyparsing for all Python versions.
+
+ [ أحمد المحمودي (Ahmed El-Mahmoudy) ]
+ * New upstream release
+ * Bumped compat level to 8.
+ * debian/control:
+ + Updated my email address.
+ + Bumped Standards-Version to 3.9.1 (no changes needed)
+ + Dropped python-pyparsing from Depends & Build-Deps, since it is not used
+ anymore.
+ + Add XS-Python-Version field.
+ + Drop python-support from Build-Deps
+ + Bumped python-all Build-Dep to (>= 2.6.6-2)
+ + Added Breaks: ${python:Breaks}, to avoid getting
+ python (<= <UNSUPPORTED VERSION>) in Depends.
+ * debian/rules:
+ + added --with python2 to dh call.
+ + Removed override for dh_pysupport, not needed anymore.
+ + Override dh_auto_clean to remove docs/build
+ * debian/copyright: updated copyrights.
+ * Dropped 01-remove-pyparsing.diff patch, as it is no more needed.
+ * Added shm_check.diff patch to check if semaphore locking works, since
+ /dev/shm is not mounted as tmpfs in build chroots.
+ * Removed debian/pyversions
+
+ [ Bernd Zeimetz ]
+ * Adding DM-Upload-Allowed: yes for أحمد المحمودي.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@sabily.org> Thu, 07 Oct 2010 20:26:14 +0200
+
+python-whoosh (0.3.18-1) unstable; urgency=low
+
+ * New upstream release.
+ * debian/control:
+ + Added python-sphinx to Build-Depends-Indep to build documentation.
+ + Added libjs-jquery to Depends.
+ + Added python-all to Build-Depends-Indep.
+ * debian/rules:
+ + Override dh_auto_test to run test suite.
+ + Override dh_auto_build to also build documentation.
+ + Override dh_compress to avoid compressing Whoosh documentation files.
+ + Override dh_installdocs to avoid installing convenience copy of
+ jquery.js. Instead, it is symlinked from the libjs-jquery package.
+ + Logic added for fixing permission of PKG-INFO file such that it would
+ work for both Debian & Ubuntu.
+ * Added debian/python-whoosh.docs, debian/python-whoosh.links,
+ debian/python-whoosh.doc-base
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net> Thu, 25 Feb 2010 08:49:55 +0200
+
+python-whoosh (0.3.16-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net> Mon, 15 Feb 2010 11:51:28 +0200
+
+python-whoosh (0.3.15-1) unstable; urgency=low
+
+ [ أحمد المحمودي (Ahmed El-Mahmoudy) ]
+ * New upstream release.
+ * debian/rules: Override dh_fixperms to remove executable bit from files in
+ /usr/share/pyshared/*.egg-info/
+ * Refresh 01-remove-pyparsing.diff patch.
+ * Switched to 3.0 (quilt) source format.
+ * debian/control:
+ + Bumped Standards-Version to 3.8.4
+ + Added myself to uploaders
+
+ [ Bernd Zeimetz ]
+ * Switch Uploaders and Maintainers in debian/control.
+
+ -- أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net> Sun, 31 Jan 2010 13:51:02 +0200
+
+python-whoosh (0.3.2-1) unstable; urgency=low
+
+ [ Bernd Zeimetz ]
+ * New upstream release.
+ * Fix watch file to avoid cluttered versions.
+ * Bump versions in build-deps to ensure that dh overrides and
+ --with-quilt works
+ * Drop tests part from debian/rules completely, there is no test
+ in the source anymore.
+ * Add debian/README.source.
+ * Bump Standards-Version to 3.8.3, no changes needed.
+
+ -- Debian Python Modules Team <python-modules-team@lists.alioth.debian.org> Thu, 05 Nov 2009 11:09:36 +0100
+
+python-whoosh (0.3.~0b24-1) experimental; urgency=low
+
+ [ Bernd Zeimetz ]
+ * New upstream release.
+ * Whoosh is not compatible with 2.4 thanks to pickling problems,
+ drop compat patch and limit versions in debian/pyversions.
+ * Update watch file.
+  * Disable tests for now as they're broken/not existent.
+
+ -- Debian Python Modules Team <python-modules-team@lists.alioth.debian.org> Fri, 02 Oct 2009 16:51:08 +0200
+
+python-whoosh (0.1.22-1) unstable; urgency=low
+
+ * New upstream release.
+ * Changed Maintainer to my credativ email address.
+ * Refreshed debian/patches/02-python2.4-fixes.diff.
+
+ -- Daniel Watkins <daniel.watkins@credativ.co.uk> Sat, 06 Jun 2009 13:38:47 +0100
+
+python-whoosh (0.1.19-2) unstable; urgency=low
+
+ * Add debian/patches/01-remove-pyparsing.diff to remove the upstream copy of
+ the python-pyparsing library.
+ * Add python-pyparsing to Build-Depends-Indep
+
+ -- Daniel Watkins <daniel@daniel-watkins.co.uk> Thu, 07 May 2009 16:44:18 +0100
+
+python-whoosh (0.1.19-1) unstable; urgency=low
+
+ * Initial release. (Closes: #522934)
+
+ -- Daniel Watkins <daniel@daniel-watkins.co.uk> Fri, 01 May 2009 12:27:22 +0100
diff --git a/debian/compat b/debian/compat
new file mode 100644
index 0000000..ec63514
--- /dev/null
+++ b/debian/compat
@@ -0,0 +1 @@
+9
diff --git a/debian/control b/debian/control
new file mode 100644
index 0000000..e7b736a
--- /dev/null
+++ b/debian/control
@@ -0,0 +1,60 @@
+Source: python-whoosh
+Section: python
+Priority: optional
+Maintainer: أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net>
+Build-Depends: debhelper (>= 9), dh-python, python-setuptools, python3-setuptools
+Build-Depends-Indep: python3-sphinx (>= 1.0.7+dfsg), python-all (>= 2.6.6-2), python3-all, python-pytest, python3-pytest
+Standards-Version: 3.9.6
+Homepage: http://bitbucket.org/mchaput/whoosh/
+X-Python-Version: >= 2.5
+X-Python3-Version: >= 3.2
+Vcs-Git: git://anonscm.debian.org/collab-maint/python-whoosh.git
+Vcs-Browser: http://anonscm.debian.org/gitweb/?p=collab-maint/python-whoosh.git
+
+Package: python-whoosh
+Architecture: all
+Depends: ${python:Depends}, ${misc:Depends}
+Suggests: python-whoosh-doc
+Description: pure-Python full-text indexing, search, and spell checking library (Python 2)
+ Whoosh is a fast, pure-Python indexing and search library. Programmers
+ can use it to easily add search functionality to their applications and
+ websites. As Whoosh is pure Python, you don't have to compile or
+ install a binary support library and/or make Python work with a JVM, yet
+ indexing and searching is still very fast. Whoosh is designed to be
+ modular, so every part can be extended or replaced to meet your needs
+ exactly.
+ .
+ This package contains the python2 library
+
+Package: python3-whoosh
+Architecture: all
+Depends: ${python3:Depends}, ${misc:Depends}
+Suggests: python-whoosh-doc
+Description: pure-Python full-text indexing, search, and spell checking library (Python 3)
+ Whoosh is a fast, pure-Python indexing and search library. Programmers
+ can use it to easily add search functionality to their applications and
+ websites. As Whoosh is pure Python, you don't have to compile or
+ install a binary support library and/or make Python work with a JVM, yet
+ indexing and searching is still very fast. Whoosh is designed to be
+ modular, so every part can be extended or replaced to meet your needs
+ exactly.
+ .
+ This package contains the python3 library
+
+Package: python-whoosh-doc
+Architecture: all
+Section: doc
+Priority: extra
+Pre-Depends: ${misc:Pre-Depends}
+Depends: ${misc:Depends}, ${sphinxdoc:Depends}
+Replaces: python-whoosh (<< 2.1.0)
+Description: full-text indexing, search, and spell checking library (doc)
+ Whoosh is a fast, pure-Python indexing and search library. Programmers
+ can use it to easily add search functionality to their applications and
+ websites. As Whoosh is pure Python, you don't have to compile or
+ install a binary support library and/or make Python work with a JVM, yet
+ indexing and searching is still very fast. Whoosh is designed to be
+ modular, so every part can be extended or replaced to meet your needs
+ exactly.
+ .
+ This package contains the library documentation for python-whoosh.
diff --git a/debian/copyright b/debian/copyright
new file mode 100644
index 0000000..09c67de
--- /dev/null
+++ b/debian/copyright
@@ -0,0 +1,144 @@
+Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0
+Upstream-Name: Whoosh
+Upstream-Contact: Matt Chaput <matt@whoosh.ca>
+Source: http://bitbucket.org/mchaput/whoosh/
+
+Files: *
+Copyright: 2007-2012 Matt Chaput <matt@whoosh.ca>
+License: BSD-2-clause
+
+Files: debian/*
+Copyright: 2009 Daniel Watkins <daniel@daniel-watkins.co.uk>
+ 2010-2015 أحمد المحمودي (Ahmed El-Mahmoudy) <aelmahmoudy@users.sourceforge.net>
+License: BSD-2-clause
+
+License: BSD-2-clause
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ .
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ .
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ .
+ THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ .
+ The views and conclusions contained in the software and documentation are
+ those of the authors and should not be interpreted as representing official
+ policies, either expressed or implied, of Matt Chaput.
+
+Files: src/whoosh/lang/porter2.py
+Copyright: 2008 Michael Dirolf <mike@dirolf.com>
+License: Expat
+ Permission is hereby granted, free of charge, to any person
+ obtaining a copy of this software and associated documentation
+ files (the "Software"), to deal in the Software without
+ restriction, including without limitation the rights to use,
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following
+ conditions:
+ .
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+ .
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+
+Files: src/whoosh/support/relativedelta.py
+Copyright: 2003-2010 Gustavo Niemeyer <gustavo@niemeyer.net>
+License: PSF
+ 1. This LICENSE AGREEMENT is between the Python Software Foundation
+ ("PSF"), and the Individual or Organization ("Licensee") accessing and
+ otherwise using this software ("Python") in source or binary form and
+ its associated documentation.
+ .
+ 2. Subject to the terms and conditions of this License Agreement, PSF
+ hereby grants Licensee a nonexclusive, royalty-free, world-wide
+ license to reproduce, analyze, test, perform and/or display publicly,
+ prepare derivative works, distribute, and otherwise use Python
+ alone or in any derivative version, provided, however, that PSF's
+ License Agreement and PSF's notice of copyright, i.e., "Copyright (c)
+ 2001, 2002, 2003, 2004, 2005, 2006, 2007 Python Software Foundation;
+ All Rights Reserved" are retained in Python alone or in any derivative
+ version prepared by Licensee.
+ .
+ 3. In the event Licensee prepares a derivative work that is based on
+ or incorporates Python or any part thereof, and wants to make
+ the derivative work available to others as provided herein, then
+ Licensee hereby agrees to include in any such work a brief summary of
+ the changes made to Python.
+ .
+ 4. PSF is making Python available to Licensee on an "AS IS"
+ basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
+ IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
+ DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
+ FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
+ INFRINGE ANY THIRD PARTY RIGHTS.
+ .
+ 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
+ FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
+ A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
+ OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+ .
+ 6. This License Agreement will automatically terminate upon a material
+ breach of its terms and conditions.
+ .
+ 7. Nothing in this License Agreement shall be deemed to create any
+ relationship of agency, partnership, or joint venture between PSF and
+ Licensee. This License Agreement does not grant permission to use PSF
+ trademarks or trade name in a trademark sense to endorse or promote
+ products or services of Licensee, or any third party.
+ .
+ 8. By copying, installing or otherwise using Python, Licensee
+ agrees to be bound by the terms and conditions of this License
+ Agreement.
+
+Files: src/whoosh/support/unicode.py
+Copyright: 1991-2008 Unicode, Inc
+License: Other
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of the Unicode data files and any associated documentation (the "Data Files")
+ or Unicode software and any associated documentation (the "Software") to deal
+ in the Data Files or Software without restriction, including without
+ limitation the rights to use, copy, modify, merge, publish, distribute, and/or
+ sell copies of the Data Files or Software, and to permit persons to whom the
+ Data Files or Software are furnished to do so, provided that (a) the above
+ copyright notice(s) and this permission notice appear with all copies of the
+ Data Files or Software, (b) both the above copyright notice(s) and this
+ permission notice appear in associated documentation, and (c) there is clear
+ notice in each modified Data File or in the Software as well as in the
+ documentation associated with the Data File(s) or Software that the data or
+ software has been modified.
+ .
+ THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+ KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
+ PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+ DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
+ DATA FILES OR SOFTWARE.
+ .
+ Except as contained in this notice, the name of a copyright holder shall not
+ be used in advertising or otherwise to promote the sale, use or other dealings
+ in these Data Files or Software without prior written authorization of the
+ copyright holder.
diff --git a/debian/python-whoosh-doc.doc-base b/debian/python-whoosh-doc.doc-base
new file mode 100644
index 0000000..161408a
--- /dev/null
+++ b/debian/python-whoosh-doc.doc-base
@@ -0,0 +1,10 @@
+Document: python-whoosh
+Title: Whoosh documentation
+Author: Matt Chaput <matt@whoosh.ca>
+Abstract: This documentation describes what Whoosh is and how it can be used to
+ develop custom search engines for your content.
+Section: Programming/Python
+
+Format: HTML
+Index: /usr/share/doc/python-whoosh-doc/html/index.html
+Files: /usr/share/doc/python-whoosh-doc/html/*.html
diff --git a/debian/python-whoosh-doc.docs b/debian/python-whoosh-doc.docs
new file mode 100644
index 0000000..ef1c0c7
--- /dev/null
+++ b/debian/python-whoosh-doc.docs
@@ -0,0 +1 @@
+docs/build/html/
diff --git a/debian/python-whoosh-doc.maintscript b/debian/python-whoosh-doc.maintscript
new file mode 100644
index 0000000..3e276e1
--- /dev/null
+++ b/debian/python-whoosh-doc.maintscript
@@ -0,0 +1 @@
+symlink_to_dir /usr/share/doc/python-whoosh-doc python-whoosh 2.5.7-2.1~
diff --git a/debian/rules b/debian/rules
new file mode 100755
index 0000000..afabdd0
--- /dev/null
+++ b/debian/rules
@@ -0,0 +1,16 @@
+#!/usr/bin/make -f
+export PYBUILD_NAME=whoosh
+
+%:
+ dh $@ --with=python2,python3,sphinxdoc --buildsystem=pybuild
+
+override_dh_auto_build:
+ dh_auto_build --buildsystem=pybuild
+ python3 setup.py build_sphinx
+
+override_dh_auto_clean:
+ dh_auto_clean
+ rm -rf docs/build
+
+override_dh_compress:
+ dh_compress -Xdoc/python-whoosh/html
diff --git a/debian/source/format b/debian/source/format
new file mode 100644
index 0000000..163aaf8
--- /dev/null
+++ b/debian/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
new file mode 100644
index 0000000..6d41d69
--- /dev/null
+++ b/debian/upstream/metadata
@@ -0,0 +1,5 @@
+Bug-Database: https://bitbucket.org/mchaput/whoosh/issues
+Contact: matt@whoosh.ca
+Homepage: http://bitbucket.org/mchaput/whoosh
+Repository: https://bitbucket.org/mchaput/whoosh
+Repository-Browse: https://bitbucket.org/mchaput/whoosh/src
diff --git a/debian/watch b/debian/watch
new file mode 100644
index 0000000..3a4da48
--- /dev/null
+++ b/debian/watch
@@ -0,0 +1,3 @@
+version=3
+opts=uversionmangle=s/(rc|a|b|c)/~$1/ \
+http://pypi.debian.net/Whoosh/Whoosh-(.+)\.(?:zip|tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) \ No newline at end of file
diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst
new file mode 100644
index 0000000..27297f6
--- /dev/null
+++ b/docs/source/analysis.rst
@@ -0,0 +1,329 @@
+===============
+About analyzers
+===============
+
+Overview
+========
+
+An analyzer is a function or callable class (a class with a ``__call__`` method)
+that takes a unicode string and returns a generator of tokens. Usually a "token"
+is a word; for example, the string "Mary had a little lamb" might yield the
+tokens "Mary", "had", "a", "little", and "lamb". However, tokens do not
+necessarily correspond to words. For example, you might tokenize Chinese text
+into individual characters or bi-grams. Tokens are the units of indexing, that
+is, they are what you are able to look up in the index.
+
+An analyzer is basically just a wrapper for a tokenizer and zero or more
+filters. The analyzer's ``__call__`` method will pass its parameters to a
+tokenizer, and the tokenizer will usually be wrapped in a few filters.
+
+A tokenizer is a callable that takes a unicode string and yields a series of
+``analysis.Token`` objects.
+
+For example, the provided :class:`whoosh.analysis.RegexTokenizer` class
+implements a customizable, regular-expression-based tokenizer that extracts
+words and ignores whitespace and punctuation.
+
+::
+
+ >>> from whoosh.analysis import RegexTokenizer
+ >>> tokenizer = RegexTokenizer()
+ >>> for token in tokenizer(u"Hello there my friend!"):
+ ... print repr(token.text)
+ u'Hello'
+ u'there'
+ u'my'
+ u'friend'
+
+A filter is a callable that takes a generator of Tokens (either a tokenizer or
+another filter) and in turn yields a series of Tokens.
+
+For example, the provided :meth:`whoosh.analysis.LowercaseFilter` filters tokens
+by converting their text to lowercase. The implementation is very simple::
+
+ def LowercaseFilter(tokens):
+ """Uses lower() to lowercase token text. For example, tokens
+ "This","is","a","TEST" become "this","is","a","test".
+ """
+
+ for t in tokens:
+ t.text = t.text.lower()
+ yield t
+
+You can wrap the filter around a tokenizer to see it in operation::
+
+ >>> from whoosh.analysis import LowercaseFilter
+ >>> for token in LowercaseFilter(tokenizer(u"These ARE the things I want!")):
+ ... print repr(token.text)
+ u'these'
+ u'are'
+ u'the'
+ u'things'
+ u'i'
+ u'want'
+
+An analyzer is just a means of combining a tokenizer and some filters into a
+single package.
+
+You can implement an analyzer as a custom class or function, or compose
+tokenizers and filters together using the ``|`` character::
+
+ my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
+
+The first item must be a tokenizer and the rest must be filters (you can't put a
+filter first or a tokenizer after the first item). Note that this only works if at
+least the tokenizer is a subclass of ``whoosh.analysis.Composable``, as all the
+tokenizers and filters that ship with Whoosh are.
+
+See the :mod:`whoosh.analysis` module for information on the available analyzers,
+tokenizers, and filters shipped with Whoosh.
+
+
+Using analyzers
+===============
+
+When you create a field in a schema, you can specify your analyzer as a keyword
+argument to the field object::
+
+ schema = Schema(content=TEXT(analyzer=StemmingAnalyzer()))
+
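+For example, a slightly fuller sketch of the same idea, with the imports
+spelled out::
+
+ from whoosh.fields import Schema, TEXT
+ from whoosh.analysis import StemmingAnalyzer
+
+ schema = Schema(content=TEXT(analyzer=StemmingAnalyzer()))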
+
+Advanced Analysis
+=================
+
+Token objects
+-------------
+
+The ``Token`` class has no methods. It is merely a place to record certain
+attributes. A ``Token`` object actually has two kinds of attributes: *settings*
+that record what kind of information the ``Token`` object does or should contain,
+and *information* about the current token.
+
+
+Token setting attributes
+------------------------
+
+A ``Token`` object should always have the following attributes. A tokenizer or
+filter can check these attributes to see what kind of information is available
+and/or what kind of information they should be setting on the ``Token`` object.
+
+These attributes are set by the tokenizer when it creates the Token(s), based on
+the parameters passed to it from the Analyzer.
+
+Filters **should not** change the values of these attributes.
+
+====== ================ =================================================== =========
+Type Attribute name Description Default
+====== ================ =================================================== =========
+str mode The mode in which the analyzer is being called, ''
+ e.g. 'index' during indexing or 'query' during
+ query parsing
+bool positions Whether term positions are recorded in the token False
+bool chars Whether term start and end character indices are False
+ recorded in the token
+bool boosts Whether per-term boosts are recorded in the token False
+bool removestops Whether stop-words should be removed from the True
+ token stream
+====== ================ =================================================== =========
+
+
+Token information attributes
+----------------------------
+
+A ``Token`` object may have any of the following attributes. The ``text`` attribute
+should always be present. The ``original`` attribute may be set by a tokenizer. All
+other attributes should only be accessed or set based on the values of the
+"settings" attributes above.
+
+======== ========== =================================================================
+Type Name Description
+======== ========== =================================================================
+unicode text The text of the token (this should always be present)
+unicode original The original (pre-filtered) text of the token. The tokenizer may
+ record this, and filters are expected not to modify it.
+int pos The position of the token in the stream, starting at 0
+ (only set if positions is True)
+int startchar The character index of the start of the token in the original
+ string (only set if chars is True)
+int endchar The character index of the end of the token in the original
+ string (only set if chars is True)
+float boost The boost for this token (only set if boosts is True)
+bool stopped Whether this token is a "stop" word
+ (only set if removestops is False)
+======== ========== =================================================================
+
+So why are most of the information attributes optional? Different field formats
+require different levels of information about each token. For example, the
+``Frequency`` format only needs the token text. The ``Positions`` format records term
+positions, so it needs them on the ``Token``. The ``Characters`` format records term
+positions and the start and end character indices of each term, so it needs them
+on the token, and so on.
+
+The ``Format`` object that represents the format of each field calls the analyzer
+for the field, and passes it parameters corresponding to the types of
+information it needs, e.g.::
+
+ analyzer(unicode_string, positions=True)
+
+The analyzer can then pass that information to a tokenizer so the tokenizer
+initializes the required attributes on the ``Token`` object(s) it produces.
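+
+As a rough sketch (not the actual implementation of Whoosh's stock tokenizers),
+a simplified tokenizer that honors the ``positions`` flag might look like this::
+
+ import re
+ from whoosh.analysis import Token
+
+ def SimpleWordTokenizer(value, positions=False, **kwargs):
+     # Create a single Token object configured with the settings the
+     # analyzer was called with, then fill it in for each word
+     t = Token(positions=positions, **kwargs)
+     for pos, match in enumerate(re.finditer(r"\w+", value)):
+         t.text = match.group(0)
+         if positions:
+             t.pos = pos
+         yield t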
+
+
+Performing different analysis for indexing and query parsing
+------------------------------------------------------------
+
+Whoosh sets the ``mode`` setting attribute to indicate whether the analyzer is
+being called by the indexer (``mode='index'``) or the query parser
+(``mode='query'``). This is useful if there's a transformation that you only
+want to apply at indexing or query parsing::
+
+ class MyFilter(Filter):
+ def __call__(self, tokens):
+ for t in tokens:
+ if t.mode == 'query':
+ ...
+ else:
+ ...
+
+The :class:`whoosh.analysis.MultiFilter` filter class lets you specify different
+filters to use based on the mode setting::
+
+ intraword = MultiFilter(index=IntraWordFilter(mergewords=True, mergenums=True),
+ query=IntraWordFilter(mergewords=False, mergenums=False))
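+
+As a sketch (the field name here is illustrative), such a filter can be composed
+into an analyzer and attached to a schema field like any other::
+
+ from whoosh import fields
+ from whoosh.analysis import RegexTokenizer, LowercaseFilter
+
+ ana = RegexTokenizer(r"\S+") | intraword | LowercaseFilter()
+ schema = fields.Schema(title=fields.TEXT(analyzer=ana, stored=True))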
+
+
+Stop words
+----------
+
+"Stop" words are words that are so common it's often counter-productive to index
+them, such as "and", "or", "if", etc. The provided ``analysis.StopFilter`` lets you
+filter out stop words, and includes a default list of common stop words.
+
+::
+
+ >>> from whoosh.analysis import StopFilter
+ >>> stopper = StopFilter()
+ >>> for token in stopper(LowercaseFilter(tokenizer(u"These ARE the things I want!"))):
+ ... print repr(token.text)
+ u'these'
+ u'things'
+ u'want'
+
+However, this seemingly simple filter idea raises a couple of minor but slightly
+thorny issues: renumbering term positions and keeping or removing stopped words.
+
+
+Renumbering term positions
+--------------------------
+
+Remember that analyzers are sometimes asked to record the position of each token
+in the token stream:
+
+============= ========== ========== ========== ==========
+Token.text u'Mary' u'had' u'a' u'lamb'
+Token.pos 0 1 2 3
+============= ========== ========== ========== ==========
+
+So what happens to the ``pos`` attribute of the tokens if ``StopFilter`` removes
+the words ``had`` and ``a`` from the stream? Should it renumber the positions to
+pretend the "stopped" words never existed? I.e.:
+
+============= ========== ==========
+Token.text u'Mary' u'lamb'
+Token.pos 0 1
+============= ========== ==========
+
+or should it preserve the original positions of the words? I.e.:
+
+============= ========== ==========
+Token.text u'Mary' u'lamb'
+Token.pos 0 3
+============= ========== ==========
+
+It turns out that different situations call for different solutions, so the
+provided ``StopFilter`` class supports both of the above behaviors. Renumbering
+is the default, since that is usually the most useful and is necessary to
+support phrase searching. However, you can set a parameter in StopFilter's
+constructor to tell it not to renumber positions::
+
+ stopper = StopFilter(renumber=False)
+
+
+Removing or leaving stop words
+------------------------------
+
+The point of using ``StopFilter`` is to remove stop words, right? Well, there
+are actually some situations where you might want to mark tokens as "stopped"
+but not remove them from the token stream.
+
+For example, if you were writing your own query parser, you could run the user's
+query through a field's analyzer to break it into tokens. In that case, you
+might want to know which words were "stopped" so you can provide helpful
+feedback to the end user (e.g. "The following words are too common to search
+for:").
+
+In other cases, you might want to leave stopped words in the stream for certain
+filtering steps (for example, you might have a step that looks at previous
+tokens, and want the stopped tokens to be part of the process), but then remove
+them later.
+
+The ``analysis`` module provides a couple of tools for keeping and removing
+stop-words in the stream.
+
+The ``removestops`` parameter passed to the analyzer's ``__call__`` method (and
+copied to the ``Token`` object as an attribute) specifies whether stop words should
+be removed from the stream or left in.
+
+::
+
+ >>> from whoosh.analysis import StandardAnalyzer
+ >>> analyzer = StandardAnalyzer()
+ >>> [(t.text, t.stopped) for t in analyzer(u"This is a test")]
+ [(u'test', False)]
+ >>> [(t.text, t.stopped) for t in analyzer(u"This is a test", removestops=False)]
+ [(u'this', True), (u'is', True), (u'a', True), (u'test', False)]
+
+The ``analysis.unstopped()`` filter function takes a token generator and yields
+only the tokens whose ``stopped`` attribute is ``False``.
+
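+For example (a small sketch reusing the ``StandardAnalyzer`` instance from the
+example above)::
+
+ >>> from whoosh.analysis import unstopped
+ >>> tokens = analyzer(u"This is a test", removestops=False)
+ >>> [t.text for t in unstopped(tokens)]
+ [u'test']
+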
+.. note::
+ Even if you leave stopped words in the stream in an analyzer you use for
+ indexing, the indexer will ignore any tokens where the ``stopped``
+ attribute is ``True``.
+
+
+Implementation notes
+--------------------
+
+Because object creation is slow in Python, the stock tokenizers do not create a
+new ``analysis.Token`` object for each token. Instead, they create one ``Token`` object
+and yield it over and over. This is a nice performance shortcut but can lead to
+strange behavior if your code tries to remember tokens between loops of the
+generator.
+
+Because the analyzer reuses a single ``Token`` object, repeatedly changing its
+attributes, any copy of the Token you keep from one loop of the generator will
+be changed out from under you. For example::
+
+ >>> list(tokenizer(u"Hello there my friend"))
+ [Token(u"friend"), Token(u"friend"), Token(u"friend"), Token(u"friend")]
+
+Instead, do this::
+
+ >>> [t.text for t in tokenizer(u"Hello there my friend")]
+
+That is, save the attributes, not the token object itself.
+
+If you implement your own tokenizer, filter, or analyzer as a class, you should
+implement an ``__eq__`` method. This is important to allow comparison of ``Schema``
+objects.
+
+The mixing of persistent "setting" and transient "information" attributes on the
+``Token`` object is not especially elegant. If I ever have a better idea I might
+change it. ;) Nothing requires that an Analyzer be implemented by calling a
+tokenizer and filters. Tokenizers and filters are simply a convenient way to
+structure the code. You're free to write an analyzer any way you want, as long
+as it implements ``__call__``.
+
+
+
diff --git a/docs/source/api/analysis.rst b/docs/source/api/analysis.rst
new file mode 100644
index 0000000..d31e96a
--- /dev/null
+++ b/docs/source/api/analysis.rst
@@ -0,0 +1,62 @@
+===================
+``analysis`` module
+===================
+
+.. automodule:: whoosh.analysis
+
+Analyzers
+=========
+
+.. autoclass:: IDAnalyzer
+.. autoclass:: KeywordAnalyzer
+.. autoclass:: RegexAnalyzer
+.. autoclass:: SimpleAnalyzer
+.. autoclass:: StandardAnalyzer
+.. autoclass:: StemmingAnalyzer
+.. autoclass:: FancyAnalyzer
+.. autoclass:: NgramAnalyzer
+.. autoclass:: NgramWordAnalyzer
+.. autoclass:: LanguageAnalyzer
+
+
+Tokenizers
+==========
+
+.. autoclass:: IDTokenizer
+.. autoclass:: RegexTokenizer
+.. autoclass:: CharsetTokenizer
+.. autoclass:: SpaceSeparatedTokenizer
+.. autoclass:: CommaSeparatedTokenizer
+.. autoclass:: NgramTokenizer
+.. autoclass:: PathTokenizer
+
+
+Filters
+=======
+
+.. autoclass:: PassFilter
+.. autoclass:: LoggingFilter
+.. autoclass:: MultiFilter
+.. autoclass:: TeeFilter
+.. autoclass:: ReverseTextFilter
+.. autoclass:: LowercaseFilter
+.. autoclass:: StripFilter
+.. autoclass:: StopFilter
+.. autoclass:: StemFilter
+.. autoclass:: CharsetFilter
+.. autoclass:: NgramFilter
+.. autoclass:: IntraWordFilter
+.. autoclass:: CompoundWordFilter
+.. autoclass:: BiWordFilter
+.. autoclass:: ShingleFilter
+.. autoclass:: DelimitedAttributeFilter
+.. autoclass:: DoubleMetaphoneFilter
+.. autoclass:: SubstitutionFilter
+
+
+Token classes and functions
+===========================
+
+.. autoclass:: Token
+.. autofunction:: unstopped
+
diff --git a/docs/source/api/api.rst b/docs/source/api/api.rst
new file mode 100644
index 0000000..f74a3c3
--- /dev/null
+++ b/docs/source/api/api.rst
@@ -0,0 +1,9 @@
+==========
+Whoosh API
+==========
+
+.. toctree::
+ :glob:
+ :maxdepth: 1
+
+ **
diff --git a/docs/source/api/codec/base.rst b/docs/source/api/codec/base.rst
new file mode 100644
index 0000000..28f707c
--- /dev/null
+++ b/docs/source/api/codec/base.rst
@@ -0,0 +1,32 @@
+=====================
+``codec.base`` module
+=====================
+
+.. automodule:: whoosh.codec.base
+
+
+Classes
+=======
+
+.. autoclass:: Codec
+ :members:
+
+.. autoclass:: PerDocumentWriter
+ :members:
+
+.. autoclass:: FieldWriter
+ :members:
+
+.. autoclass:: PostingsWriter
+ :members:
+
+.. autoclass:: TermsReader
+ :members:
+
+.. autoclass:: PerDocumentReader
+ :members:
+
+.. autoclass:: Segment
+ :members:
+
+
diff --git a/docs/source/api/collectors.rst b/docs/source/api/collectors.rst
new file mode 100644
index 0000000..b27b8c1
--- /dev/null
+++ b/docs/source/api/collectors.rst
@@ -0,0 +1,47 @@
+=====================
+``collectors`` module
+=====================
+
+.. automodule:: whoosh.collectors
+
+
+Base classes
+============
+
+.. autoclass:: Collector
+ :members:
+
+.. autoclass:: ScoredCollector
+ :members:
+
+.. autoclass:: WrappingCollector
+ :members:
+
+
+Basic collectors
+================
+
+.. autoclass:: TopCollector
+
+.. autoclass:: UnlimitedCollector
+
+.. autoclass:: SortingCollector
+
+
+Wrappers
+========
+
+.. autoclass:: FilterCollector
+
+.. autoclass:: FacetCollector
+
+.. autoclass:: CollapseCollector
+
+.. autoclass:: TimeLimitCollector
+
+.. autoclass:: TermsCollector
+
+
+
+
+
diff --git a/docs/source/api/columns.rst b/docs/source/api/columns.rst
new file mode 100644
index 0000000..26fa791
--- /dev/null
+++ b/docs/source/api/columns.rst
@@ -0,0 +1,49 @@
+=====================
+``columns`` module
+=====================
+
+.. automodule:: whoosh.columns
+
+
+Base classes
+============
+
+.. autoclass:: Column
+ :members:
+
+.. autoclass:: ColumnWriter
+ :members:
+
+.. autoclass:: ColumnReader
+ :members:
+
+
+Basic columns
+=============
+
+.. autoclass:: VarBytesColumn
+
+.. autoclass:: FixedBytesColumn
+
+.. autoclass:: RefBytesColumn
+
+.. autoclass:: NumericColumn
+
+
+Technical columns
+=================
+
+.. autoclass:: BitColumn
+
+.. autoclass:: CompressedBytesColumn
+
+.. autoclass:: StructColumn
+
+.. autoclass:: PickleColumn
+
+
+Experimental columns
+====================
+
+.. autoclass:: ClampedNumericColumn
+
diff --git a/docs/source/api/fields.rst b/docs/source/api/fields.rst
new file mode 100644
index 0000000..290feb3
--- /dev/null
+++ b/docs/source/api/fields.rst
@@ -0,0 +1,41 @@
+=================
+``fields`` module
+=================
+
+.. automodule:: whoosh.fields
+
+Schema class
+============
+
+.. autoclass:: Schema
+ :members:
+
+.. autoclass:: SchemaClass
+
+FieldType base class
+====================
+
+.. autoclass:: FieldType
+ :members:
+
+
+Pre-made field types
+====================
+
+.. autoclass:: ID
+.. autoclass:: IDLIST
+.. autoclass:: STORED
+.. autoclass:: KEYWORD
+.. autoclass:: TEXT
+.. autoclass:: NUMERIC
+.. autoclass:: DATETIME
+.. autoclass:: BOOLEAN
+.. autoclass:: NGRAM
+.. autoclass:: NGRAMWORDS
+
+
+Exceptions
+==========
+
+.. autoexception:: FieldConfigurationError
+.. autoexception:: UnknownFieldError
diff --git a/docs/source/api/filedb/filestore.rst b/docs/source/api/filedb/filestore.rst
new file mode 100644
index 0000000..2dfc2ec
--- /dev/null
+++ b/docs/source/api/filedb/filestore.rst
@@ -0,0 +1,31 @@
+===========================
+``filedb.filestore`` module
+===========================
+
+.. automodule:: whoosh.filedb.filestore
+
+Base class
+==========
+
+.. autoclass:: Storage
+ :members:
+
+
+Implementation classes
+======================
+
+.. autoclass:: FileStorage
+.. autoclass:: RamStorage
+
+
+Helper functions
+================
+
+.. autofunction:: copy_storage
+.. autofunction:: copy_to_ram
+
+
+Exceptions
+==========
+
+.. autoexception:: ReadOnlyError
diff --git a/docs/source/api/filedb/filetables.rst b/docs/source/api/filedb/filetables.rst
new file mode 100644
index 0000000..3fbf70f
--- /dev/null
+++ b/docs/source/api/filedb/filetables.rst
@@ -0,0 +1,22 @@
+============================
+``filedb.filetables`` module
+============================
+
+.. automodule:: whoosh.filedb.filetables
+
+
+Hash file
+=========
+
+.. autoclass:: HashWriter
+ :members:
+
+.. autoclass:: HashReader
+ :members:
+
+
+Ordered Hash file
+=================
+
+.. autoclass:: OrderedHashWriter
+.. autoclass:: OrderedHashReader
diff --git a/docs/source/api/filedb/structfile.rst b/docs/source/api/filedb/structfile.rst
new file mode 100644
index 0000000..7d45c66
--- /dev/null
+++ b/docs/source/api/filedb/structfile.rst
@@ -0,0 +1,14 @@
+============================
+``filedb.structfile`` module
+============================
+
+.. automodule:: whoosh.filedb.structfile
+
+Classes
+=======
+
+.. autoclass:: StructFile
+ :members:
+
+.. autoclass:: BufferFile
+.. autoclass:: ChecksumFile
diff --git a/docs/source/api/formats.rst b/docs/source/api/formats.rst
new file mode 100644
index 0000000..9cd9dd1
--- /dev/null
+++ b/docs/source/api/formats.rst
@@ -0,0 +1,24 @@
+==================
+``formats`` module
+==================
+
+.. automodule:: whoosh.formats
+
+Base class
+==========
+
+.. autoclass:: Format
+ :members:
+
+
+Formats
+=======
+
+.. autoclass:: Existence
+.. autoclass:: Frequency
+.. autoclass:: Positions
+.. autoclass:: Characters
+.. autoclass:: PositionBoosts
+.. autoclass:: CharacterBoosts
+
+
diff --git a/docs/source/api/highlight.rst b/docs/source/api/highlight.rst
new file mode 100644
index 0000000..74d2ab9
--- /dev/null
+++ b/docs/source/api/highlight.rst
@@ -0,0 +1,50 @@
+====================
+``highlight`` module
+====================
+
+.. automodule:: whoosh.highlight
+
+See :doc:`how to highlight terms in search results </highlight>`.
+
+
+Manual highlighting
+===================
+
+.. autoclass:: Highlighter
+ :members:
+
+.. autofunction:: highlight
+
+
+Fragmenters
+===========
+
+.. autoclass:: Fragmenter
+ :members:
+
+.. autoclass:: WholeFragmenter
+.. autoclass:: SentenceFragmenter
+.. autoclass:: ContextFragmenter
+.. autoclass:: PinpointFragmenter
+
+
+Scorers
+=======
+
+.. autoclass:: FragmentScorer
+.. autoclass:: BasicFragmentScorer
+
+
+Formatters
+==========
+
+.. autoclass:: UppercaseFormatter
+.. autoclass:: HtmlFormatter
+.. autoclass:: GenshiFormatter
+
+
+Utility classes
+===============
+
+.. autoclass:: Fragment
+ :members:
diff --git a/docs/source/api/idsets.rst b/docs/source/api/idsets.rst
new file mode 100644
index 0000000..0f55306
--- /dev/null
+++ b/docs/source/api/idsets.rst
@@ -0,0 +1,23 @@
+============================
+``support.bitvector`` module
+============================
+
+.. automodule:: whoosh.idsets
+
+
+Base classes
+============
+
+.. autoclass:: DocIdSet
+ :members:
+
+.. autoclass:: BaseBitSet
+
+
+Implementation classes
+======================
+
+.. autoclass:: BitSet
+.. autoclass:: OnDiskBitSet
+.. autoclass:: SortedIntSet
+.. autoclass:: MultiIdSet
diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst
new file mode 100644
index 0000000..ee38645
--- /dev/null
+++ b/docs/source/api/index.rst
@@ -0,0 +1,39 @@
+================
+``index`` module
+================
+
+.. automodule:: whoosh.index
+
+
+Functions
+=========
+
+.. autofunction:: create_in
+.. autofunction:: open_dir
+.. autofunction:: exists_in
+.. autofunction:: exists
+.. autofunction:: version_in
+.. autofunction:: version
+
+
+Base class
+==========
+
+.. autoclass:: Index
+ :members:
+
+
+Implementation
+==============
+
+.. autoclass:: FileIndex
+
+
+Exceptions
+==========
+
+.. autoexception:: LockError
+.. autoexception:: IndexError
+.. autoexception:: IndexVersionError
+.. autoexception:: OutOfDateError
+.. autoexception:: EmptyIndexError
diff --git a/docs/source/api/lang/morph_en.rst b/docs/source/api/lang/morph_en.rst
new file mode 100644
index 0000000..2a3dfe0
--- /dev/null
+++ b/docs/source/api/lang/morph_en.rst
@@ -0,0 +1,7 @@
+========================
+``lang.morph_en`` module
+========================
+
+.. automodule:: whoosh.lang.morph_en
+
+.. autofunction:: variations
diff --git a/docs/source/api/lang/porter.rst b/docs/source/api/lang/porter.rst
new file mode 100644
index 0000000..4a0220f
--- /dev/null
+++ b/docs/source/api/lang/porter.rst
@@ -0,0 +1,7 @@
+======================
+``lang.porter`` module
+======================
+
+.. automodule:: whoosh.lang.porter
+
+.. autofunction:: stem
diff --git a/docs/source/api/lang/wordnet.rst b/docs/source/api/lang/wordnet.rst
new file mode 100644
index 0000000..8adcdb0
--- /dev/null
+++ b/docs/source/api/lang/wordnet.rst
@@ -0,0 +1,20 @@
+========================
+``lang.wordnet`` module
+========================
+
+.. automodule:: whoosh.lang.wordnet
+
+Thesaurus
+=========
+
+.. autoclass:: Thesaurus
+ :members:
+
+
+Low-level functions
+===================
+
+.. autofunction:: parse_file
+.. autofunction:: synonyms
+.. autofunction:: make_index
+
diff --git a/docs/source/api/matching.rst b/docs/source/api/matching.rst
new file mode 100644
index 0000000..12f24c6
--- /dev/null
+++ b/docs/source/api/matching.rst
@@ -0,0 +1,34 @@
+===================
+``matching`` module
+===================
+
+.. automodule:: whoosh.matching
+
+Matchers
+========
+
+.. autoclass:: Matcher
+ :members:
+
+.. autoclass:: NullMatcher
+.. autoclass:: ListMatcher
+.. autoclass:: WrappingMatcher
+.. autoclass:: MultiMatcher
+.. autoclass:: FilterMatcher
+.. autoclass:: BiMatcher
+.. autoclass:: AdditiveBiMatcher
+.. autoclass:: UnionMatcher
+.. autoclass:: DisjunctionMaxMatcher
+.. autoclass:: IntersectionMatcher
+.. autoclass:: AndNotMatcher
+.. autoclass:: InverseMatcher
+.. autoclass:: RequireMatcher
+.. autoclass:: AndMaybeMatcher
+.. autoclass:: ConstantScoreMatcher
+
+
+Exceptions
+==========
+
+.. autoexception:: ReadTooFar
+.. autoexception:: NoQualityAvailable
diff --git a/docs/source/api/qparser.rst b/docs/source/api/qparser.rst
new file mode 100644
index 0000000..d3c5ecd
--- /dev/null
+++ b/docs/source/api/qparser.rst
@@ -0,0 +1,97 @@
+==================
+``qparser`` module
+==================
+
+.. automodule:: whoosh.qparser
+
+Parser object
+=============
+
+.. autoclass:: QueryParser
+ :members:
+
+Pre-made configurations
+-----------------------
+
+The following functions return pre-configured QueryParser objects.
+
+.. autofunction:: MultifieldParser
+
+.. autofunction:: SimpleParser
+
+.. autofunction:: DisMaxParser
+
+
+Plug-ins
+========
+
+.. autoclass:: Plugin
+ :members:
+
+.. autoclass:: SingleQuotePlugin
+.. autoclass:: PrefixPlugin
+.. autoclass:: WildcardPlugin
+.. autoclass:: RegexPlugin
+.. autoclass:: BoostPlugin
+.. autoclass:: GroupPlugin
+.. autoclass:: EveryPlugin
+.. autoclass:: FieldsPlugin
+.. autoclass:: PhrasePlugin
+.. autoclass:: RangePlugin
+.. autoclass:: OperatorsPlugin
+.. autoclass:: PlusMinusPlugin
+.. autoclass:: GtLtPlugin
+.. autoclass:: MultifieldPlugin
+.. autoclass:: FieldAliasPlugin
+.. autoclass:: CopyFieldPlugin
+
+
+Syntax node objects
+===================
+
+Base nodes
+----------
+
+.. autoclass:: SyntaxNode
+ :members:
+
+
+Nodes
+-----
+
+.. autoclass:: FieldnameNode
+.. autoclass:: TextNode
+.. autoclass:: WordNode
+.. autoclass:: RangeNode
+.. autoclass:: MarkerNode
+
+
+Group nodes
+-----------
+
+.. autoclass:: GroupNode
+.. autoclass:: BinaryGroup
+.. autoclass:: ErrorNode
+.. autoclass:: AndGroup
+.. autoclass:: OrGroup
+.. autoclass:: AndNotGroup
+.. autoclass:: AndMaybeGroup
+.. autoclass:: DisMaxGroup
+.. autoclass:: RequireGroup
+.. autoclass:: NotGroup
+
+
+Operators
+---------
+
+.. autoclass:: Operator
+.. autoclass:: PrefixOperator
+.. autoclass:: PostfixOperator
+.. autoclass:: InfixOperator
+
+
+
+
+
+
+
diff --git a/docs/source/api/query.rst b/docs/source/api/query.rst
new file mode 100644
index 0000000..9a7e9ff
--- /dev/null
+++ b/docs/source/api/query.rst
@@ -0,0 +1,83 @@
+================
+``query`` module
+================
+
+.. automodule:: whoosh.query
+
+See also :mod:`whoosh.qparser` which contains code for parsing user queries
+into query objects.
+
+Base classes
+============
+
+The following abstract base classes are subclassed to create the "real"
+query operations.
+
+.. autoclass:: Query
+ :members:
+
+.. autoclass:: CompoundQuery
+.. autoclass:: MultiTerm
+.. autoclass:: ExpandingTerm
+.. autoclass:: WrappingQuery
+
+
+Query classes
+=============
+
+.. autoclass:: Term
+.. autoclass:: Variations
+.. autoclass:: FuzzyTerm
+.. autoclass:: Phrase
+.. autoclass:: And
+.. autoclass:: Or
+.. autoclass:: DisjunctionMax
+.. autoclass:: Not
+.. autoclass:: Prefix
+.. autoclass:: Wildcard
+.. autoclass:: Regex
+.. autoclass:: TermRange
+.. autoclass:: NumericRange
+.. autoclass:: DateRange
+.. autoclass:: Every
+.. autoclass:: NullQuery
+
+
+Binary queries
+==============
+
+.. autoclass:: Require
+.. autoclass:: AndMaybe
+.. autoclass:: AndNot
+.. autoclass:: Otherwise
+
+
+Span queries
+============
+
+.. autoclass:: Span
+ :members:
+
+.. autoclass:: SpanQuery
+.. autoclass:: SpanFirst
+.. autoclass:: SpanNear
+.. autoclass:: SpanNear2
+.. autoclass:: SpanNot
+.. autoclass:: SpanOr
+.. autoclass:: SpanContains
+.. autoclass:: SpanBefore
+.. autoclass:: SpanCondition
+
+
+Special queries
+===============
+
+.. autoclass:: NestedParent
+.. autoclass:: NestedChildren
+.. autoclass:: ConstantScoreQuery
+
+
+Exceptions
+==========
+
+.. autoexception:: QueryError
diff --git a/docs/source/api/reading.rst b/docs/source/api/reading.rst
new file mode 100644
index 0000000..e0fd2a1
--- /dev/null
+++ b/docs/source/api/reading.rst
@@ -0,0 +1,22 @@
+==================
+``reading`` module
+==================
+
+.. automodule:: whoosh.reading
+
+Classes
+=======
+
+.. autoclass:: IndexReader
+ :members:
+
+.. autoclass:: MultiReader
+
+.. autoclass:: TermInfo
+ :members:
+
+Exceptions
+==========
+
+.. autoexception:: TermNotFound
+
diff --git a/docs/source/api/scoring.rst b/docs/source/api/scoring.rst
new file mode 100644
index 0000000..73ea1e7
--- /dev/null
+++ b/docs/source/api/scoring.rst
@@ -0,0 +1,42 @@
+==================
+``scoring`` module
+==================
+
+.. automodule:: whoosh.scoring
+
+
+Base classes
+============
+
+.. autoclass:: WeightingModel
+ :members:
+
+.. autoclass:: BaseScorer
+ :members:
+
+.. autoclass:: WeightScorer
+.. autoclass:: WeightLengthScorer
+
+
+Scoring algorithm classes
+=========================
+
+.. autoclass:: BM25F
+
+.. autoclass:: TF_IDF
+
+.. autoclass:: Frequency
+
+
+Scoring utility classes
+=======================
+
+.. autoclass:: FunctionWeighting
+
+.. autoclass:: MultiWeighting
+
+.. autoclass:: ReverseWeighting
+
+
+
+
diff --git a/docs/source/api/searching.rst b/docs/source/api/searching.rst
new file mode 100644
index 0000000..8acfe49
--- /dev/null
+++ b/docs/source/api/searching.rst
@@ -0,0 +1,33 @@
+====================
+``searching`` module
+====================
+
+.. automodule:: whoosh.searching
+
+
+Searching classes
+=================
+
+.. autoclass:: Searcher
+ :members:
+
+
+Results classes
+===============
+
+.. autoclass:: Results
+ :members:
+
+.. autoclass:: Hit
+ :members:
+
+.. autoclass:: ResultsPage
+ :members:
+
+
+Exceptions
+==========
+
+.. autoexception:: NoTermsException
+.. autoexception:: TimeLimit
+
diff --git a/docs/source/api/sorting.rst b/docs/source/api/sorting.rst
new file mode 100644
index 0000000..faf78d0
--- /dev/null
+++ b/docs/source/api/sorting.rst
@@ -0,0 +1,48 @@
+==================
+``sorting`` module
+==================
+
+.. automodule:: whoosh.sorting
+
+
+Base types
+==========
+
+.. autoclass:: FacetType
+ :members:
+
+.. autoclass:: Categorizer
+ :members:
+
+
+Facet types
+===========
+
+.. autoclass:: FieldFacet
+.. autoclass:: QueryFacet
+.. autoclass:: RangeFacet
+.. autoclass:: DateRangeFacet
+.. autoclass:: ScoreFacet
+.. autoclass:: FunctionFacet
+.. autoclass:: MultiFacet
+.. autoclass:: StoredFieldFacet
+
+
+Facets object
+=============
+
+.. autoclass:: Facets
+ :members:
+
+
+FacetType objects
+=================
+
+.. autoclass:: FacetMap
+ :members:
+.. autoclass:: OrderedList
+.. autoclass:: UnorderedList
+.. autoclass:: Count
+.. autoclass:: Best
+
+
diff --git a/docs/source/api/spelling.rst b/docs/source/api/spelling.rst
new file mode 100644
index 0000000..e89bb79
--- /dev/null
+++ b/docs/source/api/spelling.rst
@@ -0,0 +1,34 @@
+===================
+``spelling`` module
+===================
+
+See :doc:`correcting errors in user queries <../spelling>`.
+
+.. automodule:: whoosh.spelling
+
+
+Corrector objects
+=================
+
+.. autoclass:: Corrector
+ :members:
+
+.. autoclass:: ReaderCorrector
+
+.. autoclass:: GraphCorrector
+ :members:
+
+.. autoclass:: MultiCorrector
+
+
+QueryCorrector objects
+======================
+
+.. autoclass:: QueryCorrector
+ :members:
+
+.. autoclass:: SimpleQueryCorrector
+
+.. autoclass:: Correction
+
+
diff --git a/docs/source/api/support/charset.rst b/docs/source/api/support/charset.rst
new file mode 100644
index 0000000..b0a687e
--- /dev/null
+++ b/docs/source/api/support/charset.rst
@@ -0,0 +1,13 @@
+==========================
+``support.charset`` module
+==========================
+
+.. automodule:: whoosh.support.charset
+
+.. data:: default_charset
+
+ An extensive case- and accent folding charset table.
+ Taken from http://speeple.com/unicode-maps.txt
+
+.. autofunction:: charset_table_to_dict
+
diff --git a/docs/source/api/support/levenshtein.rst b/docs/source/api/support/levenshtein.rst
new file mode 100644
index 0000000..cb64027
--- /dev/null
+++ b/docs/source/api/support/levenshtein.rst
@@ -0,0 +1,10 @@
+==============================
+``support.levenshtein`` module
+==============================
+
+.. automodule:: whoosh.support.levenshtein
+
+.. autofunction:: relative
+
+.. autofunction:: distance
+
diff --git a/docs/source/api/util.rst b/docs/source/api/util.rst
new file mode 100644
index 0000000..9359f74
--- /dev/null
+++ b/docs/source/api/util.rst
@@ -0,0 +1,7 @@
+===============
+``util`` module
+===============
+
+.. automodule:: whoosh.util
+ :members:
+
diff --git a/docs/source/api/writing.rst b/docs/source/api/writing.rst
new file mode 100644
index 0000000..0bebc86
--- /dev/null
+++ b/docs/source/api/writing.rst
@@ -0,0 +1,30 @@
+==================
+``writing`` module
+==================
+
+.. automodule:: whoosh.writing
+
+
+Writer
+======
+
+.. autoclass:: IndexWriter
+ :members:
+
+
+Utility writers
+===============
+
+.. autoclass:: BufferedWriter
+ :members:
+
+.. autoclass:: AsyncWriter
+ :members:
+
+
+Exceptions
+==========
+
+.. autoexception:: IndexingError
+
+
diff --git a/docs/source/batch.rst b/docs/source/batch.rst
new file mode 100644
index 0000000..5caf256
--- /dev/null
+++ b/docs/source/batch.rst
@@ -0,0 +1,114 @@
+===================================
+Tips for speeding up batch indexing
+===================================
+
+
+Overview
+========
+
+Indexing documents tends to fall into two general patterns: adding documents
+one at a time as they are created (as in a web application), and adding a bunch
+of documents at once (batch indexing).
+
+The following settings and alternate workflows can make batch indexing faster.
+
+
+StemmingAnalyzer cache
+======================
+
+The stemming analyzer by default uses a least-recently-used (LRU) cache to limit
+the amount of memory it uses, to prevent the cache from growing very large if
+the analyzer is reused for a long period of time. However, the LRU cache can
+slow down indexing by almost 200% compared to a stemming analyzer with an
+"unbounded" cache.
+
+When you're indexing in large batches with a one-shot instance of the
+analyzer, consider using an unbounded cache::
+
+ w = myindex.writer()
+ # Get the analyzer object from a text field
+ stem_ana = w.schema["content"].format.analyzer
+ # Set the cachesize to -1 to indicate unbounded caching
+ stem_ana.cachesize = -1
+ # Reset the analyzer to pick up the changed attribute
+ stem_ana.clear()
+
+ # Use the writer to index documents...
+
+
+The ``limitmb`` parameter
+=========================
+
+The ``limitmb`` parameter to :meth:`whoosh.index.Index.writer` controls the
+*maximum* memory (in megabytes) the writer will use for the indexing pool. The
+higher the number, the faster indexing will be.
+
+The default value of ``128`` is actually somewhat low, considering many people
+have multiple gigabytes of RAM these days. Setting it higher can speed up
+indexing considerably::
+
+ from whoosh import index
+
+ ix = index.open_dir("indexdir")
+ writer = ix.writer(limitmb=256)
+
+.. note::
+ The actual memory used will be higher than this value because of interpreter
+ overhead (up to twice as much!). It is very useful as a tuning parameter,
+ but not for trying to exactly control the memory usage of Whoosh.
+
+
+The ``procs`` parameter
+=======================
+
+The ``procs`` parameter to :meth:`whoosh.index.Index.writer` controls the
+number of processors the writer will use for indexing (via the
+``multiprocessing`` module)::
+
+ from whoosh import index
+
+ ix = index.open_dir("indexdir")
+ writer = ix.writer(procs=4)
+
+Note that when you use multiprocessing, the ``limitmb`` parameter controls the
+amount of memory used by *each process*, so the actual memory used will be
+``limitmb * procs``::
+
+ # Each process will use a limit of 128, for a total of 512
+ writer = ix.writer(procs=4, limitmb=128)
+
+
+The ``multisegment`` parameter
+==============================
+
+The ``procs`` parameter causes the default writer to use multiple processors to
+do much of the indexing, but then still uses a single process to merge the pool
+of each sub-writer into a single segment.
+
+You can get much better indexing speed by also using the ``multisegment=True``
+keyword argument, which, instead of merging the results of each sub-writer,
+simply has each of them write out a new segment::
+
+ from whoosh import index
+
+ ix = index.open_dir("indexdir")
+ writer = ix.writer(procs=4, multisegment=True)
+
+The drawback is that instead
+of creating a single new segment, this option creates a number of new segments
+**at least** equal to the number of processes you use.
+
+For example, if you use ``procs=4``, the writer will create four new segments.
+(If you merge old segments or call ``add_reader`` on the parent writer, the
+parent writer will also write a segment, meaning you'll get five new segments.)
+
+So, while ``multisegment=True`` is much faster than a normal writer, you should
+only use it for large batch indexing jobs (or perhaps only for indexing from
+scratch). It should not be the only method you use for indexing, because
+otherwise the number of segments will tend to increase forever!
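+
+One possible follow-up after a big ``multisegment`` batch job (a sketch, not a
+requirement) is to merge the segments back down with an optimize pass::
+
+ from whoosh import index
+
+ ix = index.open_dir("indexdir")
+ # Merge all existing segments into a single segment
+ ix.optimize()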
+
+
+
+
+
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..e106a33
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,198 @@
+
+import sys, os, os.path
+
+sys.path.append(os.path.abspath("../../src"))
+import whoosh
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.append(os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.todo',
+ 'sphinx.ext.coverage',
+ 'sphinx.ext.ifconfig']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Whoosh'
+copyright = u'2007-2012 Matt Chaput'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = whoosh.versionstring(build=False)
+# The full version, including alpha/beta/rc tags.
+release = whoosh.versionstring()
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directory, that shouldn't be searched
+# for source files.
+exclude_trees = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. Major themes that come with
+# Sphinx are currently 'default' and 'sphinxdoc'.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+html_theme_options = {
+ "codebgcolor": "#CCC",
+ }
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_use_modindex = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Whooshdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'Whoosh.tex', u'Whoosh Documentation',
+ u'Matt Chaput', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
+
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'http://docs.python.org/': None}
+
+# Autodoc config
+autoclass_content = "both"
diff --git a/docs/source/dates.rst b/docs/source/dates.rst
new file mode 100644
index 0000000..ab1aadd
--- /dev/null
+++ b/docs/source/dates.rst
@@ -0,0 +1,202 @@
+================================
+Indexing and parsing dates/times
+================================
+
+Indexing dates
+==============
+
+Whoosh lets you index and search dates/times using the
+:class:`whoosh.fields.DATETIME` field type. Instead of passing text for the
+field in ``add_document()``, you use a Python ``datetime.datetime`` object::
+
+ from datetime import datetime, timedelta
+ from whoosh import fields, index
+
+ schema = fields.Schema(title=fields.TEXT, content=fields.TEXT,
+ date=fields.DATETIME)
+ ix = index.create_in("indexdir", schema)
+
+ w = ix.writer()
+ w.add_document(title="Document 1", content="Rendering images from the command line",
+ date=datetime.utcnow())
+ w.add_document(title="Document 2", content="Creating shaders using a node network",
+ date=datetime.utcnow() + timedelta(days=1))
+ w.commit()
+
+
+Parsing date queries
+====================
+
+Once you have an indexed ``DATETIME`` field, you can search it using a rich
+date parser contained in the :class:`whoosh.qparser.dateparse.DateParserPlugin`::
+
+ from whoosh import index
+ from whoosh.qparser import QueryParser
+ from whoosh.qparser.dateparse import DateParserPlugin
+
+ ix = index.open_dir("indexdir")
+
+ # Instantiate a query parser
+ qp = QueryParser("content", ix.schema)
+
+ # Add the DateParserPlugin to the parser
+ qp.add_plugin(DateParserPlugin())
+
+With the ``DateParserPlugin``, users can use date queries such as::
+
+ 20050912
+ 2005 sept 12th
+ june 23 1978
+ 23 mar 2005
+ july 1985
+ sep 12
+ today
+ yesterday
+ tomorrow
+ now
+ next friday
+ last tuesday
+ 5am
+ 10:25:54
+ 23:12
+ 8 PM
+ 4:46 am oct 31 2010
+ last tuesday to today
+ today to next friday
+ jan 2005 to feb 2008
+ -1 week to now
+ now to +2h
+ -1y6mo to +2 yrs 23d
+
+As with other types of queries containing spaces, users normally need to quote
+date queries using single quotes::
+
+ render date:'last tuesday' command
+ date:['last tuesday' to 'next friday']
+
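+For example (a quick sketch of putting the parser to use), running such a query
+against the index opened above::
+
+ q = qp.parse(u"render date:'last tuesday' command")
+ with ix.searcher() as s:
+     results = s.search(q)
+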
+If you use the ``free`` argument to the ``DateParserPlugin``, the plugin will
+try to parse dates from unquoted text following a date field prefix::
+
+ qp.add_plugin(DateParserPlugin(free=True))
+
+This allows the user to type a date query with spaces and special characters
+following the name of the date field and a colon. The date query can be mixed
+with other types of queries without quotes::
+
+ date:last tuesday
+ render date:oct 15th 2001 5:20am command
+
+If you don't use the ``DateParserPlugin``, users can still search DATETIME
+fields using a simple numeric form ``YYYY[MM[DD[hh[mm[ss]]]]]`` that is built
+into the ``DATETIME`` field::
+
+ from whoosh import index
+ from whoosh.qparser import QueryParser
+
+ ix = index.open_dir("indexdir")
+ qp = QueryParser("content", schema=ix.schema)
+
+ # Find all datetimes in 2005
+ q = qp.parse(u"date:2005")
+
+ # Find all datetimes on June 24, 2005
+ q = qp.parse(u"date:20050624")
+
+ # Find all datetimes from 1am-2am on June 24, 2005
+ q = qp.parse(u"date:2005062401")
+
+ # Find all datetimes from Jan 1, 2005 to June 2, 2010
+ q = qp.parse(u"date:[20050101 to 20100602]")
+
+
+About time zones and basetime
+=============================
+
+The best way to deal with time zones is to always index ``datetime``\ s in naive
+UTC form (that is, with no ``tzinfo`` attached). Any ``tzinfo`` attribute on the
+``datetime`` object is *ignored* by the indexer. If you are working with local
+datetimes, you should convert them to naive UTC datetimes before indexing.
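+
+A minimal sketch of one way to do that conversion, assuming ``local_dt`` is a
+timezone-aware ``datetime``::
+
+ # Strip the tzinfo and shift by the UTC offset to get a naive UTC datetime
+ utc_dt = local_dt.replace(tzinfo=None) - local_dt.utcoffset()
+
+The resulting ``utc_dt`` can then be passed to ``add_document()`` as shown above.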
+
+
+Date parser notes
+=================
+
+Please note that the date parser is still somewhat experimental.
+
+
+Setting the base datetime
+-------------------------
+
+When you create the ``DateParserPlugin`` you can pass a ``datetime`` object to
+the ``basedate`` argument to set the datetime against which relative queries
+(such as ``last tuesday`` and ``-2 hours``) are measured. By default, the
+basedate is ``datetime.utcnow()`` at the moment the plugin is instantiated::
+
+ qp.add_plugin(DateParserPlugin(basedate=my_datetime))
+
+
+Registering an error callback
+-----------------------------
+
+To avoid user queries causing exceptions in your application, the date parser
+attempts to fail silently when it can't parse a date query. However, you can
+register a callback function to be notified of parsing failures so you can
+display feedback to the user. The argument to the callback function is the
+date text that could not be parsed (this is an experimental feature and may
+change in future versions)::
+
+ errors = []
+ def add_error(msg):
+ errors.append(msg)
+ qp.add_plugin(DateParserPlugin(callback=add_error))
+
+ q = qp.parse(u"date:blarg")
+ # errors == [u"blarg"]
+
+
+Using free parsing
+------------------
+
+While the ``free`` option is easier for users, it may result in ambiguities.
+As one example, if you want to find documents from the year 2005 that contain
+the word "march" and the number 2, you might type::
+
+ date:2005 march 2
+
+This query would be interpreted correctly as a date query and two term queries
+when ``free=False``, but as a single date query when ``free=True``. In this
+case the user could limit the scope of the date parser with single quotes::
+
+ date:'2005' march 2
+
+
+Parsable formats
+----------------
+
+The date parser supports a wide array of date and time formats; however, it is
+not my intention to try to support *all* types of human-readable dates (for
+example ``ten to five the friday after next``). The best idea might be to pick
+a date format that works and try to train users on it, and if they use one of
+the other formats that also works, consider it a happy accident.
+
+
+Limitations
+===========
+
+* Since it's based on Python's ``datetime.datetime`` object, the ``DATETIME``
+ field shares all the limitations of that class, such as no support for
+ dates before year 1 on the proleptic Gregorian calendar. The ``DATETIME``
+  field's underlying storage supports a practically unlimited date range, so if
+  the ``datetime`` class is ever improved, the field could take advantage of it.
+  An alternative possibility might be to add support for ``mxDateTime`` objects
+  someday.
+
+* The ``DateParserPlugin`` currently only has support for English dates.
+ The architecture supports creation of parsers for other languages, and I
+ hope to add examples for other languages soon.
+
+* ``DATETIME`` fields do not currently support open-ended ranges. You can
+  simulate an open-ended range by using an endpoint far in the past or future.
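+
+  For example, a minimal sketch (the ``date`` field and the cutoff value here
+  are illustrative) that approximates "everything from June 2005 onward" by
+  using a far-future endpoint with the built-in numeric form::
+
+      q = qp.parse(u"date:[20050601 to 99991231]")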
+
+
+
+
diff --git a/docs/source/facets.rst b/docs/source/facets.rst
new file mode 100644
index 0000000..277d69a
--- /dev/null
+++ b/docs/source/facets.rst
@@ -0,0 +1,771 @@
+====================
+Sorting and faceting
+====================
+
+.. note::
+ The API for sorting and faceting changed in Whoosh 3.0.
+
+Overview
+========
+
+Sorting and faceting search results in Whoosh is based on **facets**. Each
+facet associates a value with each document in the search results, allowing you
+to sort by the keys or use them to group the documents. Whoosh includes a variety
+of **facet types** you can use for sorting and grouping (see below).
+
+
+Sorting
+=======
+
+By default, the results of a search are sorted with the highest-scoring
+documents first. You can use the ``sortedby`` keyword argument to order the
+results by some other criteria instead, such as the value of a field.
+
+
+Making fields sortable
+----------------------
+
+In order to sort on a field, you should create the field using the
+``sortable=True`` keyword argument::
+
+ schema = fields.Schema(title=fields.TEXT(sortable=True),
+ content=fields.TEXT,
+ modified=fields.DATETIME(sortable=True)
+ )
+
+It's possible to sort on a field that doesn't have ``sortable=True``, but this
+requires Whoosh to load the unique terms in the field into memory. Using
+``sortable`` is much more efficient.
+
+
+About column types
+------------------
+
+When you create a field using ``sortable=True``, you are telling Whoosh to store
+per-document values for that field in a *column*. A column object specifies the
+format to use to store the per-document values on disk.
+
+The :mod:`whoosh.columns` module contains several different column object
+implementations. Each field type specifies a reasonable default column type (for
+example, the default for text fields is :class:`whoosh.columns.VarBytesColumn`,
+the default for numeric fields is :class:`whoosh.columns.NumericColumn`).
+However, if you want maximum efficiency you may want to use a different column
+type for a field.
+
+For example, if all document values in a field are a fixed length, you can use a
+:class:`whoosh.columns.FixedBytesColumn`. If you have a field where many
+documents share a relatively small number of possible values (an example might
+be a "category" field, or "month" or other enumeration type fields), you might
+want to use :class:`whoosh.columns.RefBytesColumn` (which can handle both
+variable and fixed-length values). There are column types for storing
+per-document bit values, structs, pickled objects, and compressed byte values.
+
+To specify a custom column object for a field, pass it as the ``sortable``
+keyword argument instead of ``True``::
+
+ from whoosh import columns, fields
+
+ category_col = columns.RefBytesColumn()
+ schema = fields.Schema(title=fields.TEXT(sortable=True),
+                           category=fields.KEYWORD(sortable=category_col))
+
+
+Using a COLUMN field for custom sort keys
+-----------------------------------------
+
+When you add a document with a sortable field, Whoosh uses the value you pass
+for the field as the sortable value. For example, if "title" is a sortable
+field, and you add this document::
+
+ writer.add_document(title="Mr. Palomar")
+
+...then ``Mr. Palomar`` is stored in the field column as the sorting key for the
+document.
+
+This is usually good, but sometimes you need to "massage" the sortable key so
+it's different from the value the user searches and/or sees in the interface.
+For example, if you allow the user to sort by title, you might want to use
+different values for the visible title and the value used for sorting::
+
+ # Visible title
+ title = "The Unbearable Lightness of Being"
+
+ # Sortable title: converted to lowercase (to prevent different ordering
+ # depending on uppercase/lowercase), with initial article moved to the end
+ sort_title = "unbearable lightness of being, the"
+
+The best way to do this is to use an additional field just for sorting. You can
+use the :class:`whoosh.fields.COLUMN` field type to create a field that is not
+indexed or stored, it only holds per-document column values::
+
+ schema = fields.Schema(title=fields.TEXT(stored=True),
+ sort_title=fields.COLUMN(columns.VarBytesColumn())
+ )
+
+The single argument to the :class:`whoosh.fields.COLUMN` initializer is a
+:class:`whoosh.columns.ColumnType` object. You can use any of the various
+column types in the :mod:`whoosh.columns` module.
+
+As another example, say you are indexing documents that have a custom sorting
+order associated with each document, such as a "priority" number::
+
+ name=Big Wheel
+ price=100
+ priority=1
+
+ name=Toss Across
+ price=40
+ priority=3
+
+ name=Slinky
+ price=25
+ priority=2
+ ...
+
+You can use a column field with a numeric column object to hold the "priority"
+and use it for sorting::
+
+ schema = fields.Schema(name=fields.TEXT(stored=True),
+ price=fields.NUMERIC(stored=True),
+                           priority=fields.COLUMN(columns.NumericColumn("i")),
+                           )
+
+(Note that :class:`columns.NumericColumn` takes a type code character like the
+codes used by Python's ``struct`` and ``array`` modules.)
+
+
+Making existing fields sortable
+-------------------------------
+
+If you have an existing index from before the ``sortable`` argument was added
+in Whoosh 3.0, or you didn't think you needed a field to be sortable but now
+you find that you need to sort it, you can add "sortability" to an existing
+index using the :func:`whoosh.sorting.add_sortable` utility function::
+
+ from whoosh import columns, fields, index, sorting
+
+ # Say we have an existing index with this schema
+ schema = fields.Schema(title=fields.TEXT,
+ price=fields.NUMERIC)
+
+ # To use add_sortable, first open a writer for the index
+ ix = index.open_dir("indexdir")
+ with ix.writer() as w:
+ # Add sortable=True to the "price" field using field terms as the
+ # sortable values
+ sorting.add_sortable(w, "price", sorting.FieldFacet("price"))
+
+ # Add sortable=True to the "title" field using the
+ # stored field values as the sortable value
+ sorting.add_sortable(w, "title", sorting.StoredFieldFacet("title"))
+
+You can specify a custom column type when you call ``add_sortable`` using the
+``column`` keyword argument::
+
+ add_sortable(w, "chapter", sorting.FieldFacet("chapter"),
+ column=columns.RefBytesColumn())
+
+See the documentation for :func:`~whoosh.sorting.add_sortable` for more
+information.
+
+
+Sorting search results
+----------------------
+
+When you tell Whoosh to sort by a field (or fields), it uses the per-document
+values in the field's column as sorting keys for the documents.
+
+Normally search results are sorted by descending relevance score. You can tell
+Whoosh to use a different ordering by passing the ``sortedby`` keyword argument
+to the :meth:`~whoosh.searching.Searcher.search` method::
+
+ from whoosh import fields, index, qparser
+
+ schema = fields.Schema(title=fields.TEXT(stored=True),
+ price=fields.NUMERIC(sortable=True))
+ ix = index.create_in("indexdir", schema)
+
+ with ix.writer() as w:
+ w.add_document(title="Big Deal", price=20)
+ w.add_document(title="Mr. Big", price=10)
+ w.add_document(title="Big Top", price=15)
+
+ with ix.searcher() as s:
+        qp = qparser.QueryParser("title", ix.schema)
+ q = qp.parse(user_query_string)
+
+ # Sort search results from lowest to highest price
+ results = s.search(q, sortedby="price")
+ for hit in results:
+ print(hit["title"])
+
+You can use any of the following objects as ``sortedby`` values:
+
+A ``FacetType`` object
+ Uses this object to sort the documents. See below for the available facet
+ types.
+
+A field name string
+ Converts the field name into a ``FieldFacet`` (see below) and uses it to
+ sort the documents.
+
+A list of ``FacetType`` objects and/or field name strings
+ Bundles the facets together into a ``MultiFacet`` so you can sort by
+ multiple keys. Note that this shortcut does not allow you to reverse
+ the sort direction of individual facets. To do that, you need to construct
+ the ``MultiFacet`` object yourself.
+
+.. note::
+ You can use the ``reverse=True`` keyword argument to the
+ ``Searcher.search()`` method to reverse the overall sort direction. This
+ is more efficient than reversing each individual facet.
+
+
+Examples
+--------
+
+Sort by the value of the size field::
+
+ results = searcher.search(myquery, sortedby="size")
+
+Sort by the reverse (highest-to-lowest) order of the "price" field::
+
+ facet = sorting.FieldFacet("price", reverse=True)
+ results = searcher.search(myquery, sortedby=facet)
+
+Sort by ascending size and then descending price::
+
+ mf = sorting.MultiFacet()
+ mf.add_field("size")
+ mf.add_field("price", reverse=True)
+ results = searcher.search(myquery, sortedby=mf)
+
+ # or...
+ sizes = sorting.FieldFacet("size")
+ prices = sorting.FieldFacet("price", reverse=True)
+ results = searcher.search(myquery, sortedby=[sizes, prices])
+
+Sort by the "category" field, then by the document's score::
+
+ cats = sorting.FieldFacet("category")
+ scores = sorting.ScoreFacet()
+ results = searcher.search(myquery, sortedby=[cats, scores])
+
+
+Accessing column values
+-----------------------
+
+Per-document column values are available in :class:`~whoosh.searching.Hit`
+objects just like stored field values::
+
+ schema = fields.Schema(title=fields.TEXT(stored=True),
+ price=fields.NUMERIC(sortable=True))
+
+ ...
+
+ results = searcher.search(myquery)
+ for hit in results:
+ print(hit["title"], hit["price"])
+
+ADVANCED: if you want to access arbitrary per-document values quickly, you can get
+a column reader object::
+
+ with ix.searcher() as s:
+ reader = s.reader()
+
+        colreader = reader.column_reader("price")
+ for docnum in reader.all_doc_ids():
+ print(colreader[docnum])
+
+
+Grouping
+========
+
+It is often very useful to present "faceted" search results to the user.
+Faceting is dynamic grouping of search results into categories. The
+categories let users view a slice of the total results based on the categories
+they're interested in.
+
+For example, if you are programming a shopping website, you might want to
+display categories with the search results such as the manufacturers and price
+ranges.
+
+==================== =================
+Manufacturer Price
+-------------------- -----------------
+Apple (5) $0 - $100 (2)
+Sanyo (1) $101 - $500 (10)
+Sony (2) $501 - $1000 (1)
+Toshiba (5)
+==================== =================
+
+You can let your users click the different facet values to only show results
+in the given categories.
+
+Another useful UI pattern is to show, say, the top 5 results for different
+types of found documents, and let the user click to see more results from a
+category they're interested in, similarly to how the Spotlight quick results
+work on Mac OS X.
+
+
+The ``groupedby`` keyword argument
+----------------------------------
+
+You can use the following objects as ``groupedby`` values:
+
+A ``FacetType`` object
+ Uses this object to group the documents. See below for the available facet
+ types.
+
+A field name string
+ Converts the field name into a ``FieldFacet`` (see below) and uses it to
+ sort the documents. The name of the field is used as the facet name.
+
+A list or tuple of field name strings
+ Sets up multiple field grouping criteria.
+
+A dictionary mapping facet names to ``FacetType`` objects
+ Sets up multiple grouping criteria.
+
+A ``Facets`` object
+ This object is a lot like using a dictionary, but has some convenience
+ methods to make setting up multiple groupings a little easier.
+
+
+Examples
+--------
+
+Group by the value of the "category" field::
+
+ results = searcher.search(myquery, groupedby="category")
+
+Group by the value of the "category" field and also by the value of the "tags"
+field::
+
+ cats = sorting.FieldFacet("category")
+ tags = sorting.FieldFacet("tags", allow_overlap=True)
+ results = searcher.search(myquery, groupedby={"category": cats, "tags": tags})
+
+ # ...or, using a Facets object has a little less duplication
+ facets = sorting.Facets()
+ facets.add_field("category")
+ facets.add_field("tags", allow_overlap=True)
+ results = searcher.search(myquery, groupedby=facets)
+
+To group results by the *intersected values of multiple fields*, use a
+``MultiFacet`` object (see below). For example, if you have two fields named
+``tag`` and ``size``, you could group the results by all combinations of the
+``tag`` and ``size`` field, such as ``('tag1', 'small')``,
+``('tag2', 'small')``, ``('tag1', 'medium')``, and so on::
+
+ # Generate a grouping from the combination of the "tag" and "size" fields
+    mf = sorting.MultiFacet("tag", "size")
+ results = searcher.search(myquery, groupedby={"tag/size": mf})
+
+
+Getting the faceted groups
+--------------------------
+
+The ``Results.groups("facetname")`` method returns a dictionary mapping
+category names to lists of **document IDs**::
+
+ myfacets = sorting.Facets().add_field("size").add_field("tag")
+ results = mysearcher.search(myquery, groupedby=myfacets)
+ results.groups("size")
+ # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]}
+
+If there is only one facet, you can just use ``Results.groups()`` with no
+argument to access its groups::
+
+ results = mysearcher.search(myquery, groupedby=myfunctionfacet)
+ results.groups()
+
+By default, the values in the dictionary returned by ``groups()`` are lists of
+document numbers in the same relative order as in the results. You can use the
+``Searcher`` object's ``stored_fields()`` method to take a document number and
+return the document's stored fields as a dictionary::
+
+    categories = results.groups("category")
+    for category_name in categories:
+        print("Top 5 documents in the %s category" % category_name)
+        doclist = categories[category_name]
+        for docnum in doclist[:5]:
+            print("  ", searcher.stored_fields(docnum))
+        if len(doclist) > 5:
+            print("  (%s more)" % (len(doclist) - 5))
+
+If you want different information about the groups, for example just the count
+of documents in each group, or you don't need the groups to be ordered, you can
+specify a :class:`whoosh.sorting.FacetMap` type or instance with the
+``maptype`` keyword argument when creating the ``FacetType``::
+
+ # This is the same as the default
+ myfacet = FieldFacet("size", maptype=sorting.OrderedList)
+ results = mysearcher.search(myquery, groupedby=myfacet)
+ results.groups()
+ # {"small": [8, 5, 1, 2, 4], "medium": [3, 0, 6], "large": [7, 9]}
+
+ # Don't sort the groups to match the order of documents in the results
+ # (faster)
+ myfacet = FieldFacet("size", maptype=sorting.UnorderedList)
+ results = mysearcher.search(myquery, groupedby=myfacet)
+ results.groups()
+ # {"small": [1, 2, 4, 5, 8], "medium": [0, 3, 6], "large": [7, 9]}
+
+ # Only count the documents in each group
+ myfacet = FieldFacet("size", maptype=sorting.Count)
+ results = mysearcher.search(myquery, groupedby=myfacet)
+ results.groups()
+ # {"small": 5, "medium": 3, "large": 2}
+
+ # Only remember the "best" document in each group
+ myfacet = FieldFacet("size", maptype=sorting.Best)
+ results = mysearcher.search(myquery, groupedby=myfacet)
+ results.groups()
+ # {"small": 8, "medium": 3, "large": 7}
+
+Alternatively you can specify a ``maptype`` argument in the
+``Searcher.search()`` method call which applies to all facets::
+
+ results = mysearcher.search(myquery, groupedby=["size", "tag"],
+ maptype=sorting.Count)
+
+(You can override this overall ``maptype`` argument on individual facets by
+specifying the ``maptype`` argument for them as well.)
+
+
+Facet types
+===========
+
+FieldFacet
+----------
+
+This is the most common facet type. It sorts or groups based on the
+value in a certain field in each document. This generally works best
+(or at all) if each document has only one term in the field (e.g. an ID
+field)::
+
+ # Sort search results by the value of the "path" field
+ facet = sorting.FieldFacet("path")
+ results = searcher.search(myquery, sortedby=facet)
+
+ # Group search results by the value of the "parent" field
+ facet = sorting.FieldFacet("parent")
+ results = searcher.search(myquery, groupedby=facet)
+ parent_groups = results.groups("parent")
+
+By default, ``FieldFacet`` only supports **non-overlapping** grouping, where a
+document cannot belong to multiple facets at the same time (each document will
+be sorted into one category arbitrarily.) To get overlapping groups with
+multi-valued fields, use the ``allow_overlap=True`` keyword argument::
+
+ facet = sorting.FieldFacet(fieldname, allow_overlap=True)
+
+This supports overlapping group membership where documents have more than one
+term in a field (e.g. KEYWORD fields). If you don't need overlapping, don't
+use ``allow_overlap`` because it's *much* slower and uses more memory (see
+the secion on ``allow_overlap`` below).
+
+
+QueryFacet
+----------
+
+You can set up categories defined by arbitrary queries. For example, you can
+group names by their first letter using range queries::
+
+    # Use queries to define each category
+    # (Here "name" is a text field, so we use TermRange; for a NUMERIC
+    # field such as "price" you would use NumericRange instead)
+ qdict = {}
+ qdict["A-D"] = query.TermRange("name", "a", "d")
+ qdict["E-H"] = query.TermRange("name", "e", "h")
+ qdict["I-L"] = query.TermRange("name", "i", "l")
+ # ...
+
+ qfacet = sorting.QueryFacet(qdict)
+ r = searcher.search(myquery, groupedby={"firstltr": qfacet})
+
+By default, ``QueryFacet`` only supports **non-overlapping** grouping, where a
+document cannot belong to multiple facets at the same time (each document will
+be sorted into one category arbitrarily). To get overlapping groups with
+multi-valued fields, use the ``allow_overlap=True`` keyword argument::
+
+ facet = sorting.QueryFacet(querydict, allow_overlap=True)
+
+
+RangeFacet
+----------
+
+The ``RangeFacet`` is for NUMERIC field types. It divides a range of possible
+values into groups. For example, to group documents based on price into
+buckets $100 "wide"::
+
+ pricefacet = sorting.RangeFacet("price", 0, 1000, 100)
+
+The first argument is the name of the field. The next two arguments are the
+full range to be divided. Values outside this range (in this example, values
+below 0 and above 1000) will be sorted into the "missing" (None) group. The
+fourth argument is the "gap size", the size of the divisions in the range.
+
+The "gap" can be a list instead of a single value. In that case, the values in
+the list will be used to set the size of the initial divisions, with the last
+value in the list being the size for all subsequent divisions. For example::
+
+ pricefacet = sorting.RangeFacet("price", 0, 1000, [5, 10, 35, 50])
+
+...will set up divisions of 0-5, 5-15, 15-50, 50-100, and then use 50 as the
+size for all subsequent divisions (i.e. 100-150, 150-200, and so on).
+
+The ``hardend`` keyword argument controls whether the last division is clamped
+to the end of the range or allowed to go past the end of the range. For
+example, this::
+
+ facet = sorting.RangeFacet("num", 0, 10, 4, hardend=False)
+
+...gives divisions 0-4, 4-8, and 8-12, while this::
+
+ facet = sorting.RangeFacet("num", 0, 10, 4, hardend=True)
+
+...gives divisions 0-4, 4-8, and 8-10. (The default is ``hardend=False``.)
+
+.. note::
+ The ranges/buckets are always **inclusive** at the start and **exclusive**
+ at the end.
+
+
+DateRangeFacet
+--------------
+
+This is like ``RangeFacet`` but for DATETIME fields. The start and end values
+must be ``datetime.datetime`` objects, and the gap(s) is/are
+``datetime.timedelta`` objects.
+
+For example::
+
+ from datetime import datetime, timedelta
+
+ start = datetime(2000, 1, 1)
+ end = datetime.now()
+ gap = timedelta(days=365)
+ bdayfacet = sorting.DateRangeFacet("birthday", start, end, gap)
+
+As with ``RangeFacet``, you can use a list of gaps and the ``hardend`` keyword
+argument.
+
+
+ScoreFacet
+----------
+
+This facet uses each document's relevance score as the sorting key. It is
+sometimes useful as a secondary sort criterion.
+
+For example, to sort by the "category" field, then for documents with the same
+category, sort by the document's score::
+
+ cats = sorting.FieldFacet("category")
+ scores = sorting.ScoreFacet()
+ results = searcher.search(myquery, sortedby=[cats, scores])
+
+The ``ScoreFacet`` always sorts higher scores before lower scores.
+
+.. note::
+ While using ``sortedby=ScoreFacet()`` should give the same results as using
+ the default scored ordering (``sortedby=None``), using the facet will be
+ slower because Whoosh automatically turns off many optimizations when
+ sorting.
+
+
+FunctionFacet
+-------------
+
+This facet lets you pass a custom function to compute the sorting/grouping key
+for documents. (Using this facet type may be easier than subclassing FacetType
+and Categorizer to set up some custom behavior.)
+
+The function will be called with the index searcher and index document ID as
+arguments. For example, if you have an index with term vectors::
+
+    from whoosh.filedb.filestore import RamStorage
+
+    schema = fields.Schema(id=fields.STORED,
+                           text=fields.TEXT(stored=True, vector=True))
+    ix = RamStorage().create_index(schema)
+
+...you could use a function to sort documents higher the closer they are to
+having equal occurrences of two terms::
+
+ def fn(searcher, docnum):
+ v = dict(searcher.vector_as("frequency", docnum, "text"))
+ # Sort documents that have equal number of "alfa" and "bravo" first
+ return 0 - (1.0 / (abs(v.get("alfa", 0) - v.get("bravo", 0)) + 1.0))
+
+ facet = sorting.FunctionFacet(fn)
+ results = searcher.search(myquery, sortedby=facet)
+
+
+StoredFieldFacet
+----------------
+
+This facet lets you use stored field values as the sorting/grouping key for
+documents. This is usually slower than using an indexed field, but when using
+``allow_overlap`` it can actually be faster for large indexes just because it
+avoids the overhead of reading posting lists.
+
+:class:`~whoosh.sorting.StoredFieldFacet` supports ``allow_overlap`` by
+splitting the stored value into separate keys. By default it calls the value's
+``split()`` method (since most stored values are strings), but you can supply
+a custom split function. See the section on ``allow_overlap`` below.
+
+
+MultiFacet
+==========
+
+This facet type returns a composite of the keys returned by two or more
+sub-facets, allowing you to sort/group by the intersected values of multiple
+facets.
+
+``MultiFacet`` has methods for adding facets::
+
+    myfacet = sorting.RangeFacet("price", 0, 1000, 10)
+
+ mf = sorting.MultiFacet()
+ mf.add_field("category")
+ mf.add_field("price", reverse=True)
+ mf.add_facet(myfacet)
+ mf.add_score()
+
+You can also pass a list of field names and/or ``FacetType`` objects to the
+initializer::
+
+ prices = sorting.FieldFacet("price", reverse=True)
+ scores = sorting.ScoreFacet()
+ mf = sorting.MultiFacet("category", prices, myfacet, scores)
+
+
+Missing values
+==============
+
+* When sorting, documents without any terms in a given field, or whatever else
+ constitutes "missing" for different facet types, will always sort to the end.
+
+* When grouping, "missing" documents will appear in a group with the
+  key ``None`` (see the sketch below).
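+
+For example, a minimal sketch of what this can look like when grouping on a
+``size`` field where some matched documents have no ``size`` value (the
+document numbers are made up for illustration)::
+
+    results = searcher.search(myquery, groupedby="size")
+    results.groups("size")
+    # e.g. {"small": [1, 4], "large": [7], None: [2, 9]}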
+
+
+Using overlapping groups
+========================
+
+The common supported workflow for grouping and sorting is where the given field
+has *one value per document*, for example a ``path`` field containing the file
+path of the original document. By default, facets are set up to support this
+single-value approach.
+
+Of course, there are situations where you want documents to be sorted into
+multiple groups based on a field with multiple terms per document. The most
+common example would be a ``tags`` field. The ``allow_overlap`` keyword
+argument to the :class:`~whoosh.sorting.FieldFacet`,
+:class:`~whoosh.sorting.QueryFacet`, and
+:class:`~whoosh.sorting.StoredFieldFacet` allows this multi-value approach.
+
+However, there is an important caveat: using ``allow_overlap=True`` is slower
+than the default, potentially *much* slower for very large result sets. This is
+because Whoosh must read every posting of every term in the field to
+create a temporary "forward index" mapping documents to terms.
+
+If a field is indexed with *term vectors*, ``FieldFacet`` will use them to
+speed up ``allow_overlap`` faceting for small result sets, but for large result
+sets, where Whoosh has to open the vector list for every matched document, this
+can still be very slow.
+
+For very large indexes and result sets, if a field is stored, you can get
+faster overlapped faceting using :class:`~whoosh.sorting.StoredFieldFacet`
+instead of ``FieldFacet``. While reading stored values is usually slower than
+using the index, in this case avoiding the overhead of opening large numbers of
+posting readers can make it worthwhile.
+
+``StoredFieldFacet`` supports ``allow_overlap`` by loading the stored value for
+the given field and splitting it into multiple values. The default is to call
+the value's ``split()`` method.
+
+For example, if you've stored the ``tags`` field as a string like
+``"tag1 tag2 tag3"``::
+
+ schema = fields.Schema(name=fields.TEXT(stored=True),
+ tags=fields.KEYWORD(stored=True))
+    ix = index.create_in("indexdir", schema)
+ with ix.writer() as w:
+ w.add_document(name="A Midsummer Night's Dream", tags="comedy fairies")
+ w.add_document(name="Hamlet", tags="tragedy denmark")
+ # etc.
+
+...Then you can use a ``StoredFieldFacet`` like this::
+
+ ix = index.open_dir("indexdir")
+ with ix.searcher() as s:
+ sff = sorting.StoredFieldFacet("tags", allow_overlap=True)
+ results = s.search(myquery, groupedby={"tags": sff})
+
+For stored Python objects other than strings, you can supply a split function
+(using the ``split_fn`` keyword argument to ``StoredFieldFacet``). The function
+should accept a single argument (the stored value) and return a list or tuple
+of grouping keys.
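+
+For example, a minimal sketch assuming the stored ``tags`` value is a
+comma-separated string rather than a space-separated one::
+
+    def split_tags(value):
+        # The stored value is assumed to look like u"comedy,fairies"
+        return value.split(",")
+
+    sff = sorting.StoredFieldFacet("tags", allow_overlap=True,
+                                   split_fn=split_tags)
+    results = searcher.search(myquery, groupedby={"tags": sff})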
+
+
+Using a custom sort order
+=========================
+
+It is sometimes useful to have a custom sort order per-search. For example,
+different languages use different sort orders. If you have a function to return
+the sorting order you want for a given field value, such as an implementation of
+the Unicode Collation Algorithm (UCA), you can customize the sort order
+for the user's language.
+
+The :class:`whoosh.sorting.TranslateFacet` lets you apply a function to the
+value of another facet. This lets you "translate" a field value into an
+arbitrary sort key, such as with UCA::
+
+ from pyuca import Collator
+
+ # The Collator object has a sort_key() method which takes a unicode
+ # string and returns a sort key
+ c = Collator("allkeys.txt")
+
+ # Make a facet object for the field you want to sort on
+ nf = sorting.FieldFacet("name")
+
+ # Wrap the facet in a TranslateFacet with the translation function
+ # (the Collator object's sort_key method)
+    tf = sorting.TranslateFacet(c.sort_key, nf)
+
+ # Use the facet to sort the search results
+ results = searcher.search(myquery, sortedby=tf)
+
+(You can pass multiple "wrapped" facets to the ``TranslateFacet``, and it will
+call the function with the values of the facets as multiple arguments.)
+
+``TranslateFacet`` can also be very useful with numeric fields to sort on the
+output of some formula::
+
+ # Sort based on the average of two numeric fields
+ def average(a, b):
+ return (a + b) / 2.0
+
+ # Create two facets for the fields and pass them with the function to
+ # TranslateFacet
+ af = sorting.FieldFacet("age")
+ wf = sorting.FieldFacet("weight")
+ facet = sorting.TranslateFacet(average, af, wf)
+
+    results = searcher.search(myquery, sortedby=facet)
+
+Remember that you can still sort by multiple facets. For example, you could sort
+by a numeric value transformed by a quantizing function first, and then if that
+is equal sort by the value of another field::
+
+ # Sort by a quantized size first, then by name
+ tf = sorting.TranslateFacet(quantize, sorting.FieldFacet("size"))
+ results = searcher.search(myquery, sortedby=[tf, "name"])
+
+
+Expert: writing your own facet
+==============================
+
+TBD.
+
+
diff --git a/docs/source/fieldcaches.rst b/docs/source/fieldcaches.rst
new file mode 100644
index 0000000..49091dc
--- /dev/null
+++ b/docs/source/fieldcaches.rst
@@ -0,0 +1,52 @@
+============
+Field caches
+============
+
+The default (``filedb``) backend uses *field caches* in certain circumstances.
+The field cache basically pre-computes the order of documents in the index to
+speed up sorting and faceting.
+
+Generating field caches can take time the first time you sort/facet on a large
+index. The field cache is kept in memory (and by default written to disk when it
+is generated) so subsequent sorted/faceted searches should be faster.
+
+The default caching policy never expires field caches, so reused searchers and/or
+sorting a lot of different fields could use up quite a bit of memory with large
+indexes.
+
+
+Customizing cache behaviour
+===========================
+
+(The following API examples refer to the default ``filedb`` backend.)
+
+*By default*, Whoosh saves field caches to disk. To prevent a reader or searcher
+from writing out field caches, do this before you start using it::
+
+ searcher.set_caching_policy(save=False)
+
+By default, if caches are written to disk they are saved in the index directory.
+To tell a reader or searcher to save cache files to a different location, create
+a storage object and pass it to the ``storage`` keyword argument::
+
+ from whoosh.filedb.filestore import FileStorage
+
+ mystorage = FileStorage("path/to/cachedir")
+ reader.set_caching_policy(storage=mystorage)
+
+
+Creating a custom caching policy
+================================
+
+Expert users who want to implement a custom caching policy (for example, to add
+cache expiration) should subclass :class:`whoosh.filedb.fieldcache.FieldCachingPolicy`.
+Then you can pass an instance of your policy object to the ``set_caching_policy``
+method::
+
+ searcher.set_caching_policy(MyPolicy())
+
+
+
+
+
+
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
new file mode 100644
index 0000000..e9dd52d
--- /dev/null
+++ b/docs/source/glossary.rst
@@ -0,0 +1,65 @@
+.. _glossary:
+
+========
+Glossary
+========
+
+.. glossary::
+
+ Analysis
+ The process of breaking the text of a field into individual *terms*
+ to be indexed. This consists of tokenizing the text into terms, and then optionally
+ filtering the tokenized terms (for example, lowercasing and removing *stop words*).
+ Whoosh includes several different analyzers.
+
+ Corpus
+ The set of documents you are indexing.
+
+ Documents
+ The individual pieces of content you want to make searchable.
+ The word "documents" might imply files, but the data source could really be
+ anything -- articles in a content management system, blog posts in a blogging
+ system, chunks of a very large file, rows returned from an SQL query, individual
+ email messages from a mailbox file, or whatever. When you get search results
+ from Whoosh, the results are a list of documents, whatever "documents" means in
+ your search engine.
+
+ Fields
+ Each document contains a set of fields. Typical fields might be "title", "content",
+ "url", "keywords", "status", "date", etc. Fields can be indexed (so they're
+ searchable) and/or stored with the document. Storing the field makes it available
+ in search results. For example, you typically want to store the "title" field so
+ your search results can display it.
+
+ Forward index
+ A table listing every document and the words that appear in the document.
+ Whoosh lets you store *term vectors* that are a kind of forward index.
+
+ Indexing
+ The process of examining documents in the corpus and adding them to the
+ *reverse index*.
+
+ Postings
+ The *reverse index* lists every word in the corpus, and for each word, a list
+ of documents in which that word appears, along with some optional information
+ (such as the number of times the word appears in that document). These items
+ in the list, containing a document number and any extra information, are
+ called *postings*. In Whoosh the information stored in postings is customizable
+ for each *field*.
+
+ Reverse index
+ Basically a table listing every word in the corpus, and for each word, the
+ list of documents in which it appears. It can be more complicated (the index can
+ also list how many times the word appears in each document, the positions at which
+    it appears, etc.) but that's the basic idea.
+
+ Schema
+ Whoosh requires that you specify the *fields* of the index before you begin
+ indexing. The Schema associates field names with metadata about the field, such
+ as the format of the *postings* and whether the contents of the field are stored
+ in the index.
+
+ Term vector
+ A *forward index* for a certain field in a certain document. You can specify
+ in the Schema that a given field should store term vectors.
+
diff --git a/docs/source/highlight.rst b/docs/source/highlight.rst
new file mode 100644
index 0000000..5244c5f
--- /dev/null
+++ b/docs/source/highlight.rst
@@ -0,0 +1,419 @@
+================================================
+How to create highlighted search result excerpts
+================================================
+
+Overview
+========
+
+The highlighting system works as a pipeline, with four component types.
+
+* **Fragmenters** chop up the original text into *fragments*, based on the
+ locations of matched terms in the text.
+
+* **Scorers** assign a score to each fragment, allowing the system to rank the
+  best fragments by the chosen criterion.
+
+* **Order functions** control in what order the top-scoring fragments are
+ presented to the user. For example, you can show the fragments in the order
+ they appear in the document (FIRST) or show higher-scoring fragments first
+  (SCORE).
+
+* **Formatters** turn the fragment objects into human-readable output, such as
+ an HTML string.
+
+
+Requirements
+============
+
+Highlighting requires that you have the text of the indexed document available.
+You can keep the text in a stored field, or if the original text is available
+in a file, database column, etc, just reload it on the fly. Note that you might
+need to process the text to remove e.g. HTML tags, wiki markup, etc.
+
+
+How to
+======
+
+Get search results::
+
+ results = mysearcher.search(myquery)
+ for hit in results:
+ print(hit["title"])
+
+You can use the :meth:`~whoosh.searching.Hit.highlights` method on the
+:class:`whoosh.searching.Hit` object to get highlighted snippets from the
+document containing the search terms.
+
+The first argument is the name of the field to highlight. If the field is
+stored, this is the only argument you need to supply::
+
+ results = mysearcher.search(myquery)
+ for hit in results:
+ print(hit["title"])
+ # Assume "content" field is stored
+ print(hit.highlights("content"))
+
+If the field is not stored, you need to retrieve the text of the field some
+other way. For example, reading it from the original file or a database. Then
+you can supply the text to highlight with the ``text`` argument::
+
+ results = mysearcher.search(myquery)
+ for hit in results:
+ print(hit["title"])
+
+ # Assume the "path" stored field contains a path to the original file
+ with open(hit["path"]) as fileobj:
+ filecontents = fileobj.read()
+
+ print(hit.highlights("content", text=filecontents))
+
+
+The character limit
+===================
+
+By default, Whoosh only pulls fragments from the first 32K characters of the
+text. This prevents very long texts from bogging down the highlighting process
+too much, and is usually justified since important/summary information is
+usually at the start of a document. However, if you find the highlights are
+missing information (for example, very long encyclopedia articles where the
+terms appear in a later section), you can increase the fragmenter's character
+limit.
+
+You can change the character limit on the results object like this::
+
+ results = mysearcher.search(myquery)
+ results.fragmenter.charlimit = 100000
+
+To turn off the character limit::
+
+ results.fragmenter.charlimit = None
+
+If you instantiate a custom fragmenter, you can set the character limit on it
+directly::
+
+ sf = highlight.SentenceFragmenter(charlimit=100000)
+ results.fragmenter = sf
+
+See below for information on customizing the highlights.
+
+If you increase or disable the character limit to highlight long documents, you
+may need to use the tips in the "speeding up highlighting" section below to
+make highlighting faster.
+
+
+Customizing the highlights
+==========================
+
+Number of fragments
+-------------------
+
+You can use the ``top`` keyword argument to control the number of fragments
+returned in each snippet::
+
+ # Show a maximum of 5 fragments from the document
+    print(hit.highlights("content", top=5))
+
+
+Fragment size
+-------------
+
+The default fragmenter has a ``maxchars`` attribute (default 200) controlling
+the maximum length of a fragment, and a ``surround`` attribute (default 20)
+controlling the maximum number of characters of context to add at the beginning
+and end of a fragment::
+
+ # Allow larger fragments
+ results.fragmenter.maxchars = 300
+
+ # Show more context before and after
+ results.fragmenter.surround = 50
+
+
+Fragmenter
+----------
+
+A fragmenter controls how to extract excerpts from the original text.
+
+The ``highlight`` module has the following pre-made fragmenters:
+
+:class:`whoosh.highlight.ContextFragmenter` (the default)
+ This is a "smart" fragmenter that finds matched terms and then pulls
+    in surrounding text to form fragments. This fragmenter only yields
+ fragments that contain matched terms.
+
+:class:`whoosh.highlight.SentenceFragmenter`
+ Tries to break the text into fragments based on sentence punctuation
+ (".", "!", and "?"). This object works by looking in the original
+ text for a sentence end as the next character after each token's
+ 'endchar'. Can be fooled by e.g. source code, decimals, etc.
+
+:class:`whoosh.highlight.WholeFragmenter`
+ Returns the entire text as one "fragment". This can be useful if you
+ are highlighting a short bit of text and don't need to fragment it.
+
+The different fragmenters have different options. For example, the default
+:class:`~whoosh.highlight.ContextFragmenter` lets you set the maximum
+fragment size and the size of the context to add on either side::
+
+ my_cf = highlight.ContextFragmenter(maxchars=100, surround=30)
+
+See the :mod:`whoosh.highlight` docs for more information.
+
+To use a different fragmenter::
+
+ results.fragmenter = my_cf
+
+
+Scorer
+------
+
+A scorer is a callable that takes a :class:`whoosh.highlight.Fragment` object and
+returns a sortable value (where higher values represent better fragments).
+The default scorer adds up the number of matched terms in the fragment, and
+adds a "bonus" for the number of __different__ matched terms. The highlighting
+system uses this score to select the best fragments to show to the user.
+
+As an example of a custom scorer, to rank fragments by lowest standard
+deviation of the positions of matched terms in the fragment::
+
+ def StandardDeviationScorer(fragment):
+ """Gives higher scores to fragments where the matched terms are close
+ together.
+ """
+
+        # Since lower values are better in this case, we need to negate the
+        # value. (stddev() is assumed to be a standard-deviation helper
+        # defined elsewhere, e.g. statistics.stdev.)
+        return 0 - stddev([t.pos for t in fragment.matched])
+
+To use a different scorer::
+
+ results.scorer = StandardDeviationScorer
+
+
+Order
+-----
+
+The order is a function that takes a fragment and returns a sortable value used
+to sort the highest-scoring fragments before presenting them to the user (where
+fragments with lower values appear before fragments with higher values).
+
+The ``highlight`` module has the following order functions.
+
+``FIRST`` (the default)
+ Show fragments in the order they appear in the document.
+
+``SCORE``
+ Show highest scoring fragments first.
+
+The ``highlight`` module also includes ``LONGER`` (longer fragments first) and
+``SHORTER`` (shorter fragments first), but they probably aren't as generally
+useful.
+
+To use a different order::
+
+ results.order = highlight.SCORE
+
+
+Formatter
+---------
+
+A formatter controls how the highest scoring fragments are turned into a
+formatted bit of text for display to the user. It can return anything
+(e.g. plain text, HTML, a Genshi event stream, a SAX event generator,
+or anything else useful to the calling system).
+
+The ``highlight`` module contains the following pre-made formatters.
+
+:class:`whoosh.highlight.HtmlFormatter`
+ Outputs a string containing HTML tags (with a class attribute)
+ around the matched terms.
+
+:class:`whoosh.highlight.UppercaseFormatter`
+ Converts the matched terms to UPPERCASE.
+
+:class:`whoosh.highlight.GenshiFormatter`
+ Outputs a Genshi event stream, with the matched terms wrapped in a
+ configurable element.
+
+The easiest way to create a custom formatter is to subclass
+``highlight.Formatter`` and override the ``format_token`` method::
+
+ class BracketFormatter(highlight.Formatter):
+ """Puts square brackets around the matched terms.
+ """
+
+ def format_token(self, text, token, replace=False):
+ # Use the get_text function to get the text corresponding to the
+ # token
+ tokentext = highlight.get_text(text, token)
+
+ # Return the text as you want it to appear in the highlighted
+ # string
+ return "[%s]" % tokentext
+
+To use a different formatter::
+
+ brf = BracketFormatter()
+ results.formatter = brf
+
+If you need more control over the formatting (or want to output something other
+than strings), you will need to override other methods. See the documentation
+for the :class:`whoosh.highlight.Formatter` class.
+
+
+Highlighter object
+==================
+
+Rather than setting attributes on the results object, you can create a
+reusable :class:`whoosh.highlight.Highlighter` object. Keyword arguments let
+you change the ``fragmenter``, ``scorer``, ``order``, and/or ``formatter``::
+
+ hi = highlight.Highlighter(fragmenter=my_cf, scorer=sds)
+
+You can then use the :meth:`whoosh.highlight.Highlighter.highlight_hit` method
+to get highlights for a ``Hit`` object::
+
+ for hit in results:
+ print(hit["title"])
+ print(hi.highlight_hit(hit))
+
+(When you assign to a ``Results`` object's ``fragmenter``, ``scorer``, ``order``,
+or ``formatter`` attributes, you're actually changing the values on the
+results object's default ``Highlighter`` object.)
+
+
+Speeding up highlighting
+========================
+
+Recording which terms matched in which documents during the search may make
+highlighting faster, since it will skip documents it knows don't contain any
+matching terms in the given field::
+
+ # Record per-document term matches
+ results = searcher.search(myquery, terms=True)
+
+
+PinpointFragmenter
+------------------
+
+Usually the highlighting system uses the field's analyzer to re-tokenize the
+document's text to find the matching terms in context. If you have long
+documents and have increased/disabled the character limit, and/or if the field
+has a very complex analyzer, re-tokenizing may be slow.
+
+Instead of retokenizing, Whoosh can look up the character positions of the
+matched terms in the index. Looking up the character positions is not
+instantaneous, but is usually faster than analyzing large amounts of text.
+
+To use :class:`whoosh.highlight.PinpointFragmenter` and avoid re-tokenizing the
+document text, you must do all of the following:
+
+Index the field with character information (this will require re-indexing an
+existing index)::
+
+ # Index the start and end chars of each term
+ schema = fields.Schema(content=fields.TEXT(stored=True, chars=True))
+
+Record per-document term matches in the results::
+
+ # Record per-document term matches
+ results = searcher.search(myquery, terms=True)
+
+Set a :class:`whoosh.highlight.PinpointFragmenter` as the fragmenter::
+
+ results.fragmenter = highlight.PinpointFragmenter()
+
+
+PinpointFragmenter limitations
+------------------------------
+
+When the highlighting system does not re-tokenize the text, it doesn't know
+where any other words are in the text except the matched terms it looked up in
+the index. Therefore when the fragmenter adds surrounding context, it just adds
+a certain number of characters blindly, and so doesn't distinguish between
+content and whitespace, or break on word boundaries, for example::
+
+ >>> hit.highlights("content")
+ 're when the <b>fragmenter</b>\n ad'
+
+(This can be embarrassing when the word fragments form dirty words!)
+
+One way to avoid this is to not show any surrounding context, but then
+fragments containing one matched term will contain ONLY that matched term::
+
+ >>> hit.highlights("content")
+ '<b>fragmenter</b>'
+
+Alternatively, you can normalize whitespace in the text before passing it to
+the highlighting system::
+
+    >>> text = searcher.stored_fields(hit.docnum)["content"]
+    >>> text = re.sub("[\t\r\n ]+", " ", text)
+ >>> hit.highlights("content", text=text)
+
+...and use the ``autotrim`` option of ``PinpointFragmenter`` to automatically
+strip text before the first space and after the last space in the fragments::
+
+ >>> results.fragmenter = highlight.PinpointFragmenter(autotrim=True)
+ >>> hit.highlights("content")
+ 'when the <b>fragmenter</b>'
+
+
+Using the low-level API
+=======================
+
+Usage
+-----
+
+The following function lets you retokenize and highlight a piece of text using
+an analyzer::
+
+ from whoosh.highlight import highlight
+
+ excerpts = highlight(text, terms, analyzer, fragmenter, formatter, top=3,
+ scorer=BasicFragmentScorer, minscore=1, order=FIRST)
+
+``text``
+ The original text of the document.
+
+``terms``
+ A sequence or set containing the query words to match, e.g. ("render",
+ "shader").
+
+``analyzer``
+ The analyzer to use to break the document text into tokens for matching
+ against the query terms. This is usually the analyzer for the field the
+ query terms are in.
+
+``fragmenter``
+ A :class:`whoosh.highlight.Fragmenter` object, see below.
+
+``formatter``
+ A :class:`whoosh.highlight.Formatter` object, see below.
+
+``top``
+ The number of fragments to include in the output.
+
+``scorer``
+ A :class:`whoosh.highlight.FragmentScorer` object. The only scorer currently
+ included with Whoosh is :class:`~whoosh.highlight.BasicFragmentScorer`, the
+ default.
+
+``minscore``
+ The minimum score a fragment must have to be considered for inclusion.
+
+``order``
+ An ordering function that determines the order of the "top" fragments in the
+ output text.
+
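+An illustrative sketch of calling this function directly (the sample text and
+terms are made up for demonstration)::
+
+    from whoosh.analysis import StandardAnalyzer
+    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
+
+    text = u"The quick brown fox jumps over the lazy dog"
+    terms = frozenset([u"fox", u"dog"])
+
+    excerpt = highlight(text, terms, StandardAnalyzer(),
+                        ContextFragmenter(), UppercaseFormatter(), top=1)
+    print(excerpt)
+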
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..213f8cc
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,50 @@
+==============================
+Whoosh |release| documentation
+==============================
+
+Whoosh was created by `Matt Chaput <mailto:matt@whoosh.ca>`_.
+You can view outstanding issues on the
+`Whoosh Bitbucket page <http://bitbucket.org/mchaput/whoosh>`_
+and get help on the `Whoosh mailing list <http://groups.google.com/group/whoosh>`_.
+
+
+Contents
+========
+
+.. toctree::
+ :maxdepth: 2
+
+ releases/index
+ quickstart
+ intro
+ glossary
+ schema
+ indexing
+ searching
+ parsing
+ querylang
+ dates
+ query
+ analysis
+ stemming
+ ngrams
+ facets
+ highlight
+ keywords
+ spelling
+ fieldcaches
+ batch
+ threads
+ nested
+ recipes
+ api/api
+ tech/index
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/docs/source/indexing.rst b/docs/source/indexing.rst
new file mode 100644
index 0000000..e8278df
--- /dev/null
+++ b/docs/source/indexing.rst
@@ -0,0 +1,440 @@
+======================
+How to index documents
+======================
+
+Creating an Index object
+========================
+
+To create an index in a directory, use ``index.create_in``::
+
+ import os, os.path
+ from whoosh import index
+
+ if not os.path.exists("indexdir"):
+ os.mkdir("indexdir")
+
+ ix = index.create_in("indexdir", schema)
+
+To open an existing index in a directory, use ``index.open_dir``::
+
+ import whoosh.index as index
+
+ ix = index.open_dir("indexdir")
+
+These are convenience methods for::
+
+ from whoosh.filedb.filestore import FileStorage
+ storage = FileStorage("indexdir")
+
+ # Create an index
+ ix = storage.create_index(schema)
+
+ # Open an existing index
+    ix = storage.open_index()
+
+The schema you created the index with is pickled and stored with the index.
+
+You can keep multiple indexes in the same directory using the ``indexname`` keyword
+argument::
+
+ # Using the convenience functions
+ ix = index.create_in("indexdir", schema=schema, indexname="usages")
+ ix = index.open_dir("indexdir", indexname="usages")
+
+ # Using the Storage object
+ ix = storage.create_index(schema, indexname="usages")
+ ix = storage.open_index(indexname="usages")
+
+
+Clearing the index
+==================
+
+Calling ``index.create_in`` on a directory with an existing index will clear the
+current contents of the index.
+
+To test whether a directory currently contains a valid index, use
+``index.exists_in``::
+
+ exists = index.exists_in("indexdir")
+ usages_exists = index.exists_in("indexdir", indexname="usages")
+
+(Alternatively you can simply delete the index's files from the directory, e.g.
+if you only have one index in the directory, use ``shutil.rmtree`` to remove the
+directory and then recreate it.)
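+
+For example, a quick sketch of wiping out a directory that holds a single index
+and recreating it (assuming ``schema`` is already defined)::
+
+    import os, shutil
+    from whoosh import index
+
+    if os.path.exists("indexdir"):
+        shutil.rmtree("indexdir")
+    os.mkdir("indexdir")
+    ix = index.create_in("indexdir", schema)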
+
+
+Indexing documents
+==================
+
+Once you've created an ``Index`` object, you can add documents to the index with an
+``IndexWriter`` object. The easiest way to get the ``IndexWriter`` is to call
+``Index.writer()``::
+
+ ix = index.open_dir("index")
+ writer = ix.writer()
+
+Creating a writer locks the index for writing, so only one thread/process at
+a time can have a writer open.
+
+.. note::
+
+ Because opening a writer locks the index for writing, in a multi-threaded
+ or multi-process environment your code needs to be aware that opening a
+ writer may raise an exception (``whoosh.store.LockError``) if a writer is
+ already open. Whoosh includes a couple of example implementations
+ (:class:`whoosh.writing.AsyncWriter` and
+ :class:`whoosh.writing.BufferedWriter`) of ways to work around the write
+ lock.
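+
+For example, a minimal sketch of using :class:`whoosh.writing.AsyncWriter`
+(assuming ``ix`` is an open index whose schema has ``title`` and ``content``
+fields); roughly speaking, if the lock is not available the writes are buffered
+and committed from a background thread once it becomes free::
+
+    from whoosh.writing import AsyncWriter
+
+    writer = AsyncWriter(ix)
+    writer.add_document(title=u"Queued document",
+                        content=u"Added through an AsyncWriter")
+    writer.commit()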
+
+.. note::
+
+ While the writer is open and during the commit, the index is still
+ available for reading. Existing readers are unaffected and new readers can
+ open the current index normally. Once the commit is finished, existing
+ readers continue to see the previous version of the index (that is, they
+ do not automatically see the newly committed changes). New readers will see
+ the updated index.
+
+The IndexWriter's ``add_document(**kwargs)`` method accepts keyword arguments
+where the field name is mapped to a value::
+
+ writer = ix.writer()
+ writer.add_document(title=u"My document", content=u"This is my document!",
+ path=u"/a", tags=u"first short", icon=u"/icons/star.png")
+ writer.add_document(title=u"Second try", content=u"This is the second example.",
+ path=u"/b", tags=u"second short", icon=u"/icons/sheep.png")
+ writer.add_document(title=u"Third time's the charm", content=u"Examples are many.",
+ path=u"/c", tags=u"short", icon=u"/icons/book.png")
+ writer.commit()
+
+You don't have to fill in a value for every field. Whoosh doesn't care if you
+leave out a field from a document.
+
+Indexed fields must be passed a unicode value. Fields that are stored but not
+indexed (i.e. the ``STORED`` field type) can be passed any pickle-able object.
+
+Whoosh will happily allow you to add documents with identical values, which can
+be useful or annoying depending on what you're using the library for::
+
+ writer.add_document(path=u"/a", title=u"A", content=u"Hello there")
+ writer.add_document(path=u"/a", title=u"A", content=u"Deja vu!")
+
+This adds two documents to the index with identical path and title fields. See
+"updating documents" below for information on the ``update_document`` method, which
+uses "unique" fields to replace old documents instead of appending.
+
+
+Indexing and storing different values for the same field
+--------------------------------------------------------
+
+If you have a field that is both indexed and stored, you can index a unicode
+value but store a different object if necessary (it's usually not, but sometimes
+this is really useful) using a "special" keyword argument ``_stored_<fieldname>``.
+The normal value will be analyzed and indexed, but the "stored" value will show
+up in the results::
+
+ writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title")
+
+
+Finishing adding documents
+--------------------------
+
+An ``IndexWriter`` object is kind of like a database transaction. You specify a
+bunch of changes to the index, and then "commit" them all at once.
+
+Calling ``commit()`` on the ``IndexWriter`` saves the added documents to the
+index::
+
+ writer.commit()
+
+Once your documents are in the index, you can search for them.
+
+If you want to close the writer without committing the changes, call
+``cancel()`` instead of ``commit()``::
+
+ writer.cancel()
+
+Keep in mind that while you have a writer open (including a writer you opened
+that is still in scope), no other thread or process can get a writer or modify
+the index. A writer also keeps several open files. So you should always remember
+to call either ``commit()`` or ``cancel()`` when you're done with a writer object.
+
+
+Merging segments
+================
+
+A Whoosh ``filedb`` index is really a container for one or more "sub-indexes"
+called segments. When you add documents to an index, instead of integrating the
+new documents with the existing documents (which could potentially be very
+expensive, since it involves resorting all the indexed terms on disk), Whoosh
+creates a new segment next to the existing segment. Then when you search the
+index, Whoosh searches both segments individually and merges the results so the
+segments appear to be one unified index. (This smart design is copied from
+Lucene.)
+
+So, having a few segments is more efficient than rewriting the entire index
+every time you add some documents. But searching multiple segments does slow
+down searching somewhat, and the more segments you have, the slower it gets. So
+Whoosh has an algorithm that runs when you call ``commit()`` that looks for small
+segments it can merge together to make fewer, bigger segments.
+
+To prevent Whoosh from merging segments during a commit, use the ``merge``
+keyword argument::
+
+ writer.commit(merge=False)
+
+To merge all segments together, optimizing the index into a single segment,
+use the ``optimize`` keyword argument::
+
+ writer.commit(optimize=True)
+
+Since optimizing rewrites all the information in the index, it can be slow on
+a large index. It's generally better to rely on Whoosh's merging algorithm than
+to optimize all the time.
+
+(The ``Index`` object also has an ``optimize()`` method that lets you optimize the
+index (merge all the segments together). It simply creates a writer and calls
+``commit(optimize=True)`` on it.)
+
+For more control over segment merging, you can write your own merge policy
+function and use it as an argument to the ``commit()`` method. See the
+implementation of the ``NO_MERGE``, ``MERGE_SMALL``, and ``OPTIMIZE`` functions
+in the ``whoosh.writing`` module.
+
+
+Deleting documents
+==================
+
+You can delete documents using the following methods on an ``IndexWriter``
+object. You then need to call ``commit()`` on the writer to save the deletions
+to disk.
+
+``delete_document(docnum)``
+
+ Low-level method to delete a document by its internal document number.
+
+``is_deleted(docnum)``
+
+ Low-level method, returns ``True`` if the document with the given internal
+ number is deleted.
+
+``delete_by_term(fieldname, termtext)``
+
+ Deletes any documents where the given (indexed) field contains the given
+ term. This is mostly useful for ``ID`` or ``KEYWORD`` fields.
+
+``delete_by_query(query)``
+
+ Deletes any documents that match the given query.
+
+::
+
+ # Delete document by its path -- this field must be indexed
+ ix.delete_by_term('path', u'/a/b/c')
+ # Save the deletion to disk
+ ix.commit()
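+
+You can also delete documents matching an arbitrary query. A minimal sketch,
+using a parsed query and a writer as a context manager so the deletion is
+committed automatically (the term ``outdated`` is just an example)::
+
+    from whoosh.qparser import QueryParser
+
+    with ix.writer() as w:
+        q = QueryParser("content", ix.schema).parse(u"outdated")
+        w.delete_by_query(q)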
+
+In the ``filedb`` backend, "deleting" a document simply adds the document number
+to a list of deleted documents stored with the index. When you search the index,
+it knows not to return deleted documents in the results. However, the document's
+contents are still stored in the index, and certain statistics (such as term
+document frequencies) are not updated, until you merge the segments containing
+deleted documents (see merging above). (This is because removing the information
+immediately from the index would essentially involve rewriting the entire
+index on disk, which would be very inefficient.)
+
+
+Updating documents
+==================
+
+If you want to "replace" (re-index) a document, you can delete the old document
+using one of the ``delete_*`` methods on ``Index`` or ``IndexWriter``, then use
+``IndexWriter.add_document`` to add the new version. Or, you can use
+``IndexWriter.update_document`` to do this in one step.
+
+For ``update_document`` to work, you must have marked at least one of the fields
+in the schema as "unique". Whoosh will then use the contents of the "unique"
+field(s) to search for documents to delete::
+
+    from whoosh import index
+    from whoosh.fields import Schema, ID, TEXT
+
+    schema = Schema(path=ID(unique=True), content=TEXT)
+
+ ix = index.create_in("index")
+ writer = ix.writer()
+ writer.add_document(path=u"/a", content=u"The first document")
+ writer.add_document(path=u"/b", content=u"The second document")
+ writer.commit()
+
+ writer = ix.writer()
+ # Because "path" is marked as unique, calling update_document with path="/a"
+ # will delete any existing documents where the "path" field contains "/a".
+    writer.update_document(path=u"/a", content=u"Replacement for the first document")
+ writer.commit()
+
+The "unique" field(s) must be indexed.
+
+If no existing document matches the unique fields of the document you're
+updating, ``update_document`` acts just like ``add_document``.
+
+"Unique" fields and ``update_document`` are simply convenient shortcuts for deleting
+and adding. Whoosh has no inherent concept of a unique identifier, and in no way
+enforces uniqueness when you use ``add_document``.
+
+
+Incremental indexing
+====================
+
+When you're indexing a collection of documents, you'll often want two code
+paths: one to index all the documents from scratch, and one to only update the
+documents that have changed (leaving aside web applications where you need to
+add/update documents according to user actions).
+
+Indexing everything from scratch is pretty easy. Here's a simple example::
+
+ import os.path
+ from whoosh import index
+    from whoosh.fields import Schema, ID, STORED, TEXT
+
+ def clean_index(dirname):
+ # Always create the index from scratch
+ ix = index.create_in(dirname, schema=get_schema())
+ writer = ix.writer()
+
+ # Assume we have a function that gathers the filenames of the
+ # documents to be indexed
+ for path in my_docs():
+ add_doc(writer, path)
+
+ writer.commit()
+
+
+    def get_schema():
+        return Schema(path=ID(unique=True, stored=True), content=TEXT)
+
+
+ def add_doc(writer, path):
+ fileobj = open(path, "rb")
+ content = fileobj.read()
+ fileobj.close()
+ writer.add_document(path=path, content=content)
+
+Now, for a small collection of documents, indexing from scratch every time might
+actually be fast enough. But for large collections, you'll want to have the
+script only re-index the documents that have changed.
+
+To start we'll need to store each document's last-modified time, so we can check
+if the file has changed. In this example, we'll just use the mtime for
+simplicity::
+
+    def get_schema():
+        return Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT)
+
+ def add_doc(writer, path):
+ fileobj = open(path, "rb")
+ content = fileobj.read()
+ fileobj.close()
+ modtime = os.path.getmtime(path)
+ writer.add_document(path=path, content=content, time=modtime)
+
+Now we can modify the script to allow either "clean" (from scratch) or
+incremental indexing::
+
+ def index_my_docs(dirname, clean=False):
+ if clean:
+ clean_index(dirname)
+ else:
+ incremental_index(dirname)
+
+
+    def incremental_index(dirname):
+ ix = index.open_dir(dirname)
+
+ # The set of all paths in the index
+ indexed_paths = set()
+ # The set of all paths we need to re-index
+ to_index = set()
+
+ with ix.searcher() as searcher:
+ writer = ix.writer()
+
+ # Loop over the stored fields in the index
+ for fields in searcher.all_stored_fields():
+ indexed_path = fields['path']
+ indexed_paths.add(indexed_path)
+
+ if not os.path.exists(indexed_path):
+ # This file was deleted since it was indexed
+ writer.delete_by_term('path', indexed_path)
+
+ else:
+ # Check if this file was changed since it
+ # was indexed
+ indexed_time = fields['time']
+ mtime = os.path.getmtime(indexed_path)
+ if mtime > indexed_time:
+ # The file has changed, delete it and add it to the list of
+ # files to reindex
+ writer.delete_by_term('path', indexed_path)
+ to_index.add(indexed_path)
+
+ # Loop over the files in the filesystem
+ # Assume we have a function that gathers the filenames of the
+ # documents to be indexed
+ for path in my_docs():
+ if path in to_index or path not in indexed_paths:
+ # This is either a file that's changed, or a new file
+ # that wasn't indexed before. So index it!
+ add_doc(writer, path)
+
+ writer.commit()
+
+The ``incremental_index`` function:
+
+* Loops through all the paths that are currently indexed.
+
+ * If any of the files no longer exist, delete the corresponding document from
+ the index.
+
+ * If the file still exists, but has been modified, add it to the list of paths
+ to be re-indexed.
+
+ * If the file exists, whether it's been modified or not, add it to the list of
+ all indexed paths.
+
+* Loops through all the paths of the files on disk.
+
+ * If a path is not in the set of all indexed paths, the file is new and we
+ need to index it.
+
+ * If a path is in the set of paths to re-index, we need to index it.
+
+ * Otherwise, we can skip indexing the file.
+
+
+Clearing the index
+==================
+
+In some cases you may want to re-index from scratch. To clear the index without
+disrupting any existing readers::
+
+ from whoosh import writing
+
+ with myindex.writer() as mywriter:
+ # You can optionally add documents to the writer here
+ # e.g. mywriter.add_document(...)
+
+ # Using mergetype=CLEAR clears all existing segments so the index will
+ # only have any documents you've added to this writer
+ mywriter.mergetype = writing.CLEAR
+
+Or, if you don't use the writer as a context manager and call ``commit()``
+directly, do it like this::
+
+ mywriter = myindex.writer()
+ # ...
+ mywriter.commit(mergetype=writing.CLEAR)
+
+.. note::
+ If you don't need to worry about existing readers, a more efficient method
+ is to simply delete the contents of the index directory and start over.
diff --git a/docs/source/intro.rst b/docs/source/intro.rst
new file mode 100644
index 0000000..95c70d0
--- /dev/null
+++ b/docs/source/intro.rst
@@ -0,0 +1,60 @@
+======================
+Introduction to Whoosh
+======================
+
+About Whoosh
+------------
+
+Whoosh was created by `Matt Chaput <mailto:matt@whoosh.ca>`_. It started as a quick and dirty
+search server for the online documentation of the `Houdini <http://www.sidefx.com/>`_
+3D animation software package. Side Effects Software generously allowed Matt to open source
+the code in case it might be useful to anyone else who needs a very flexible or pure-Python
+search engine (or both!).
+
+* Whoosh is fast, but uses only pure Python, so it will run anywhere Python runs,
+ without requiring a compiler.
+
+* By default, Whoosh uses the `Okapi BM25F <http://en.wikipedia.com/wiki/Okapi_BM25>`_ ranking
+ function, but like most things the ranking function can be easily customized.
+
+* Whoosh creates fairly small indexes compared to many other search libraries.
+
+* All indexed text in Whoosh must be *unicode*.
+
+* Whoosh lets you store arbitrary Python objects with indexed documents.
+
+
+What is Whoosh?
+---------------
+
+Whoosh is a fast, pure Python search engine library.
+
+The primary design impetus of Whoosh is that it is pure Python. You should be able to
+use Whoosh anywhere you can use Python, no compiler or Java required.
+
+Like one of its ancestors, Lucene, Whoosh is not really a search engine; it's a programmer
+library for creating a search engine [1]_.
+
+Practically no important behavior of Whoosh is hard-coded. Indexing
+of text, the level of information stored for each term in each field, parsing of search queries,
+the types of queries allowed, scoring algorithms, etc. are all customizable, replaceable, and
+extensible.
+
+
+.. [1] It would of course be possible to build a turnkey search engine on top of Whoosh,
+ like Nutch and Solr use Lucene.
+
+
+What can Whoosh do for you?
+---------------------------
+
+Whoosh lets you index free-form or structured text and then quickly find matching
+documents based on simple or complex search criteria.
+
+
+Getting help with Whoosh
+------------------------
+
+You can view outstanding issues on the
+`Whoosh Bitbucket page <http://bitbucket.org/mchaput/whoosh>`_
+and get help on the `Whoosh mailing list <http://groups.google.com/group/whoosh>`_.
diff --git a/docs/source/keywords.rst b/docs/source/keywords.rst
new file mode 100644
index 0000000..82bb6cd
--- /dev/null
+++ b/docs/source/keywords.rst
@@ -0,0 +1,94 @@
+=======================================
+Query expansion and keyword extraction
+=======================================
+
+Overview
+========
+
+Whoosh provides methods for computing the "key terms" of a set of documents. For
+these methods, "key terms" basically means terms that are frequent in the given
+documents, but relatively infrequent in the indexed collection as a whole.
+
+Because this is a purely statistical operation, not a natural language
+processing or AI function, the quality of the results will vary based on the
+content, the size of the document collection, and the number of documents for
+which you extract keywords.
+
+These methods can be useful for providing the following features to users:
+
+* Search term expansion. You can extract key terms for the top N results from a
+ query and suggest them to the user as additional/alternate query terms to try.
+
+* Tag suggestion. Extracting the key terms for a single document may yield
+ useful suggestions for tagging the document.
+
+* "More like this". You can extract key terms for the top ten or so results from
+ a query (and removing the original query terms), and use those key words as
+ the basis for another query that may find more documents using terms the user
+ didn't think of.
+
+Usage
+=====
+
+* Get more documents like a certain search hit. *This requires that the field
+ you want to match on is vectored or stored, or that you have access to the
+ original text (such as from a database)*.
+
+ Use :meth:`~whoosh.searching.Hit.more_like_this`::
+
+ results = mysearcher.search(myquery)
+ first_hit = results[0]
+ more_results = first_hit.more_like_this("content")
+
+* Extract keywords for the top N documents in a
+ :class:`whoosh.searching.Results` object. *This requires that the field is
+ either vectored or stored*.
+
+ Use the :meth:`~whoosh.searching.Results.key_terms` method of the
+ :class:`whoosh.searching.Results` object to extract keywords from the top N
+ documents of the result set.
+
+ For example, to extract *five* key terms from the ``content`` field of the top
+ *ten* documents of a results object::
+
+      keywords = [keyword for keyword, score
+                  in results.key_terms("content", docs=10, numterms=5)]
+
+* Extract keywords for an arbitrary set of documents. *This requires that the
+ field is either vectored or stored*.
+
+ Use the :meth:`~whoosh.searching.Searcher.document_number` or
+ :meth:`~whoosh.searching.Searcher.document_numbers` methods of the
+ :class:`whoosh.searching.Searcher` object to get the document numbers for the
+ document(s) you want to extract keywords from.
+
+ Use the :meth:`~whoosh.searching.Searcher.key_terms` method of a
+ :class:`whoosh.searching.Searcher` to extract the keywords, given the list of
+ document numbers.
+
+  For example, let's say you have an index of emails. To extract key terms from
+  the ``body`` field of emails whose ``emailto`` field contains
+  ``matt@whoosh.ca``::
+
+ with email_index.searcher() as s:
+ docnums = s.document_numbers(emailto=u"matt@whoosh.ca")
+ keywords = [keyword for keyword, score
+ in s.key_terms(docnums, "body")]
+
+* Extract keywords from arbitrary text not in the index.
+
+ Use the :meth:`~whoosh.searching.Searcher.key_terms_from_text` method of a
+ :class:`whoosh.searching.Searcher` to extract the keywords, given the text::
+
+ with email_index.searcher() as s:
+ keywords = [keyword for keyword, score
+ in s.key_terms_from_text("body", mytext)]
+
+
+Expansion models
+================
+
+The ``ExpansionModel`` subclasses in the :mod:`whoosh.classify` module implement
+different weighting functions for key words. These models are translated into
+Python from original Java implementations in Terrier.
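+
+For example, a sketch of choosing a different model when extracting key terms
+(``Bo2Model`` is assumed here to be one of the models available in
+:mod:`whoosh.classify`)::
+
+    from whoosh import classify
+
+    with email_index.searcher() as s:
+        docnums = s.document_numbers(emailto=u"matt@whoosh.ca")
+        keywords = [kw for kw, score
+                    in s.key_terms(docnums, "body", model=classify.Bo2Model)]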
+
diff --git a/docs/source/nested.rst b/docs/source/nested.rst
new file mode 100644
index 0000000..106a350
--- /dev/null
+++ b/docs/source/nested.rst
@@ -0,0 +1,238 @@
+===========================================
+Indexing and searching document hierarchies
+===========================================
+
+Overview
+========
+
+Whoosh's full-text index is essentially a flat database of documents. However,
+Whoosh supports two techniques for simulating the indexing and querying of
+hierarchical documents, that is, sets of documents that form a parent-child
+hierarchy, such as "Chapter - Section - Paragraph" or
+"Module - Class - Method".
+
+You can specify parent-child relationships *at indexing time*, by grouping
+documents in the same hierarchy, and then use the
+:class:`whoosh.query.NestedParent` and/or :class:`whoosh.query.NestedChildren`
+queries to find parents based on their children or vice-versa.
+
+Alternatively, you can use *query time joins*, essentially like external key
+joins in a database, where you perform one search to find a relevant document,
+then use a stored value on that document (for example, a ``parent`` field) to
+look up another document.
+
+Both methods have pros and cons.
+
+
+Using nested document indexing
+==============================
+
+Indexing
+--------
+
+This method works by indexing a "parent" document and all its "child" documents
+*as a "group"* so they are guaranteed to end up in the same segment. You can
+use the context manager returned by ``IndexWriter.group()`` to group
+documents::
+
+ with ix.writer() as w:
+ with w.group():
+ w.add_document(kind="class", name="Index")
+ w.add_document(kind="method", name="add document")
+ w.add_document(kind="method", name="add reader")
+ w.add_document(kind="method", name="close")
+ with w.group():
+ w.add_document(kind="class", name="Accumulator")
+ w.add_document(kind="method", name="add")
+ w.add_document(kind="method", name="get result")
+ with w.group():
+ w.add_document(kind="class", name="Calculator")
+ w.add_document(kind="method", name="add")
+ w.add_document(kind="method", name="add all")
+ w.add_document(kind="method", name="add some")
+ w.add_document(kind="method", name="multiply")
+ w.add_document(kind="method", name="close")
+ with w.group():
+ w.add_document(kind="class", name="Deleter")
+ w.add_document(kind="method", name="add")
+ w.add_document(kind="method", name="delete")
+
+Alternatively you can use the ``start_group()`` and ``end_group()`` methods::
+
+ with ix.writer() as w:
+ w.start_group()
+ w.add_document(kind="class", name="Index")
+ w.add_document(kind="method", name="add document")
+ w.add_document(kind="method", name="add reader")
+ w.add_document(kind="method", name="close")
+ w.end_group()
+
+Each level of the hierarchy should have a query that distinguishes it from
+other levels (for example, in the above index, you can use ``kind:class`` or
+``kind:method`` to match different levels of the hierarchy).
+
+Once you've indexed the hierarchy of documents, you can use two query types to
+find parents based on children or vice-versa.
+
+(There is currently no support in the default query parser for nested queries.)
+
+
+NestedParent query
+------------------
+
+The :class:`whoosh.query.NestedParent` query type lets you specify a query for
+child documents, but have the query return an "ancestor" document from higher
+in the hierarchy::
+
+ # First, we need a query that matches all the documents in the "parent"
+ # level we want of the hierarchy
+ all_parents = query.Term("kind", "class")
+
+ # Then, we need a query that matches the children we want to find
+ wanted_kids = query.Term("name", "close")
+
+ # Now we can make a query that will match documents where "name" is
+ # "close", but the query will return the "parent" documents of the matching
+ # children
+ q = query.NestedParent(all_parents, wanted_kids)
+ # results = Index, Calculator
+
+Note that in a hierarchy with more than two levels, you can specify a "parents"
+query that matches any level of the hierarchy, so you can return the top-level
+ancestors of the matching children, or the second level, third level, etc.
+
+The query works by first building a bit vector representing which documents are
+"parents"::
+
+ Index
+ | Calculator
+ | |
+ 1000100100000100
+ | |
+ | Deleter
+ Accumulator
+
+Then for each match of the "child" query, it calculates the previous parent
+from the bit vector and returns it as a match (it only returns each parent once
+no matter how many children match). This parent lookup is very efficient::
+
+ 1000100100000100
+ |
+ |<-+ close
+
+
+NestedChildren query
+--------------------
+
+The opposite of ``NestedParent`` is :class:`whoosh.query.NestedChildren`. This
+query lets you match parents but return their children. This is useful, for
+example, to search for an album title and return the songs in the album::
+
+ # Query that matches all documents in the "parent" level we want to match
+ # at
+ all_parents = query.Term("kind", "album")
+
+ # Parent documents we want to match
+ wanted_parents = query.Term("album_title", "heaven")
+
+ # Now we can make a query that will match parent documents where "album_title"
+ # contains "heaven", but the query will return the "child" documents of the
+ # matching parents
+ q1 = query.NestedChildren(all_parents, wanted_parents)
+
+You can then combine that query with an ``AND`` clause, for example to find
+songs with "hell" in the song title that occur on albums with "heaven" in the
+album title::
+
+ q2 = query.And([q1, query.Term("song_title", "hell")])
+
+
+Deleting and updating hierarchical documents
+--------------------------------------------
+
+The drawback of the index-time method is *updating and deleting*. Because the
+implementation of the queries depends on the parent and child documents being
+contiguous in the segment, you can't update/delete just one child document.
+You can only update/delete an entire top-level document at once (for example,
+if your hierarchy is "Chapter - Section - Paragraph", you can only update or
+delete entire chapters, not a section or paragraph). If the top-level of the
+hierarchy represents very large blocks of text, this can involve a lot of
+deleting and reindexing.
+
+Currently ``Writer.update_document()`` does not automatically work with nested
+documents. You must manually delete and re-add document groups to update them.
+
+To delete nested document groups, use the ``Writer.delete_by_query()``
+method with a ``NestedParent`` query::
+
+ # Delete the "Accumulator" class
+ all_parents = query.Term("kind", "class")
+ to_delete = query.Term("name", "Accumulator")
+ q = query.NestedParent(all_parents, to_delete)
+ with myindex.writer() as w:
+ w.delete_by_query(q)
+
+
+Using query-time joins
+======================
+
+A second technique for simulating hierarchical documents in Whoosh involves
+using a stored field on each document to point to its parent, and then using
+the value of that field at query time to find parents and children.
+
+For example, if we index a hierarchy of classes and methods using pointers
+to parents instead of nesting::
+
+ # Store a pointer to the parent on each "method" document
+ with ix.writer() as w:
+ w.add_document(kind="class", c_name="Index", docstring="...")
+ w.add_document(kind="method", m_name="add document", parent="Index")
+ w.add_document(kind="method", m_name="add reader", parent="Index")
+ w.add_document(kind="method", m_name="close", parent="Index")
+
+ w.add_document(kind="class", c_name="Accumulator", docstring="...")
+ w.add_document(kind="method", m_name="add", parent="Accumulator")
+ w.add_document(kind="method", m_name="get result", parent="Accumulator")
+
+ w.add_document(kind="class", c_name="Calculator", docstring="...")
+ w.add_document(kind="method", m_name="add", parent="Calculator")
+ w.add_document(kind="method", m_name="add all", parent="Calculator")
+ w.add_document(kind="method", m_name="add some", parent="Calculator")
+ w.add_document(kind="method", m_name="multiply", parent="Calculator")
+ w.add_document(kind="method", m_name="close", parent="Calculator")
+
+ w.add_document(kind="class", c_name="Deleter", docstring="...")
+ w.add_document(kind="method", m_name="add", parent="Deleter")
+ w.add_document(kind="method", m_name="delete", parent="Deleter")
+
+ # Now do manual joins at query time
+ with ix.searcher() as s:
+ # Tip: Searcher.document() and Searcher.documents() let you look up
+ # documents by field values more easily than using Searcher.search()
+
+ # Children to parents:
+ # Print the docstrings of classes on which "close" methods occur
+ for child_doc in s.documents(m_name="close"):
+ # Use the stored value of the "parent" field to look up the parent
+ # document
+ parent_doc = s.document(c_name=child_doc["parent"])
+ # Print the parent document's stored docstring field
+ print(parent_doc["docstring"])
+
+ # Parents to children:
+ # Find classes with "big" in the docstring and print their methods
+ q = query.Term("kind", "class") & query.Term("docstring", "big")
+ for hit in s.search(q, limit=None):
+ print("Class name=", hit["c_name"], "methods:")
+ for child_doc in s.documents(parent=hit["c_name"]):
+ print(" Method name=", child_doc["m_name"])
+
+This technique is more flexible than index-time nesting in that you can
+delete/update individual documents in the hierarchy piece by piece, although it
+doesn't support finding different parent levels as easily. It is also slower
+than index-time nesting (potentially much slower), since you must perform
+additional searches for each found document.
+
+Future versions of Whoosh may include "join" queries to make this process more
+efficient (or at least more automatic).
+
diff --git a/docs/source/ngrams.rst b/docs/source/ngrams.rst
new file mode 100644
index 0000000..484a271
--- /dev/null
+++ b/docs/source/ngrams.rst
@@ -0,0 +1,51 @@
+==============================
+Indexing and searching N-grams
+==============================
+
+Overview
+========
+
+N-gram indexing is a powerful method for getting fast, "search as you type"
+functionality like iTunes. It is also useful for quick and effective indexing
+of languages such as Chinese and Japanese without word breaks.
+
+"N-grams" are groups of N characters: bigrams are groups of two characters,
+trigrams are groups of three characters, and so on.
+
+Whoosh includes two methods for analyzing N-gram fields: an N-gram tokenizer,
+and a filter that breaks tokens into N-grams.
+
+:class:`whoosh.analysis.NgramTokenizer` tokenizes the entire field into N-grams.
+This is more useful for Chinese/Japanese/Korean languages, where it's useful
+to index bigrams of characters rather than individual characters. Using this
+tokenizer with languages written in the Roman alphabet produces tokens that
+contain spaces.
+
+::
+
+ >>> ngt = NgramTokenizer(minsize=2, maxsize=4)
+ >>> [token.text for token in ngt(u"hi there")]
+    [u'hi', u'hi ', u'hi t', u'i ', u'i t', u'i th', u' t', u' th', u' the', u'th',
+     u'the', u'ther', u'he', u'her', u'here', u'er', u'ere', u're']
+
+:class:`whoosh.analysis.NgramFilter` breaks individual tokens into N-grams as
+part of an analysis pipeline. This is more useful for languages with word
+separation.
+
+::
+
+ >>> my_analyzer = StandardAnalyzer() | NgramFilter(minsize=2, maxsize=4)
+ >>> [token.text for token in my_analyzer(u"rendering shaders")]
+ [u'ren', u'rend', u'end', u'ende', u'nde', u'nder', u'der', u'deri', u'eri',
+ u'erin', u'rin', u'ring', u'ing', u'sha', u'shad', u'had', u'hade', u'ade',
+ u'ader', u'der', u'ders', u'ers']
+
+Whoosh includes two pre-configured field types for N-grams:
+:class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`. The only
+difference is that ``NGRAM`` runs all text through the N-gram filter, including
+whitespace and punctuation, while ``NGRAMWORDS`` extracts words from the text
+using a tokenizer, then runs each word through the N-gram filter.
+
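+For example, a sketch of a schema using ``NGRAMWORDS`` for a search-as-you-type
+``title`` field (the ``minsize``/``maxsize`` values here are just illustrative)::
+
+    from whoosh.fields import Schema, ID, NGRAMWORDS
+
+    schema = Schema(path=ID(stored=True),
+                    title=NGRAMWORDS(minsize=2, maxsize=4, stored=True))
+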
+TBD.
+
+
+
diff --git a/docs/source/parsing.rst b/docs/source/parsing.rst
new file mode 100644
index 0000000..35327c7
--- /dev/null
+++ b/docs/source/parsing.rst
@@ -0,0 +1,437 @@
+====================
+Parsing user queries
+====================
+
+Overview
+========
+
+The job of a query parser is to convert a *query string* submitted by a user
+into *query objects* (objects from the :mod:`whoosh.query` module).
+
+For example, the user query:
+
+.. code-block:: none
+
+ rendering shading
+
+might be parsed into query objects like this::
+
+ And([Term("content", u"rendering"), Term("content", u"shading")])
+
+Whoosh includes a powerful, modular parser for user queries in the
+:mod:`whoosh.qparser` module. The default parser implements a query language
+similar to the one that ships with Lucene. However, by changing plugins or using
+functions such as :func:`whoosh.qparser.MultifieldParser`,
+:func:`whoosh.qparser.SimpleParser` or :func:`whoosh.qparser.DisMaxParser`, you
+can change how the parser works, get a simpler parser or change the query
+language syntax.
+
+(In previous versions of Whoosh, the query parser was based on ``pyparsing``.
+The new hand-written parser is less brittle and more flexible.)
+
+.. note::
+
+ Remember that you can directly create query objects programmatically using
+ the objects in the :mod:`whoosh.query` module. If you are not processing
+ actual user queries, this is preferable to building a query string just to
+ parse it.
+
+
+Using the default parser
+========================
+
+To create a :class:`whoosh.qparser.QueryParser` object, pass it the name of the
+*default field* to search and the schema of the index you'll be searching.
+
+::
+
+ from whoosh.qparser import QueryParser
+
+ parser = QueryParser("content", schema=myindex.schema)
+
+.. tip::
+
+ You can instantiate a ``QueryParser`` object without specifying a schema,
+ however the parser will not process the text of the user query. This is
+ useful for debugging, when you want to see how QueryParser will build a
+ query, but don't want to make up a schema just for testing.
+
+Once you have a ``QueryParser`` object, you can call ``parse()`` on it to parse a
+query string into a query object::
+
+ >>> parser.parse(u"alpha OR beta gamma")
+ And([Or([Term('content', u'alpha'), Term('content', u'beta')]), Term('content', u'gamma')])
+
+See the :doc:`query language reference <querylang>` for the features and syntax
+of the default parser's query language.
+
+
+Common customizations
+=====================
+
+Searching for any terms instead of all terms by default
+-------------------------------------------------------
+
+If the user doesn't explicitly specify ``AND`` or ``OR`` clauses::
+
+ physically based rendering
+
+...by default, the parser treats the words as if they were connected by ``AND``,
+meaning all the terms must be present for a document to match::
+
+ physically AND based AND rendering
+
+To change the parser to use ``OR`` instead, so that any of the terms may be
+present for a document to match, i.e.::
+
+ physically OR based OR rendering
+
+...configure the QueryParser using the ``group`` keyword argument like this::
+
+ from whoosh import qparser
+
+ parser = qparser.QueryParser(fieldname, schema=myindex.schema,
+ group=qparser.OrGroup)
+
+By default, the ``Or`` query does not give an extra bonus to documents simply
+for matching more of the query terms. For example, if the user searches for
+``foo bar``, a document with four occurrences of ``foo`` would normally
+outscore a document that contained one occurrence each of ``foo`` and ``bar``.
+However, users usually expect documents that contain more of the words they
+searched for to score higher. To configure the parser to produce Or groups
+with this behavior, use the ``factory()`` class method of ``OrGroup``::
+
+ og = qparser.OrGroup.factory(0.9)
+ parser = qparser.QueryParser(fieldname, schema, group=og)
+
+where the argument to ``factory()`` is a scaling factor on the bonus
+(between 0 and 1).
+
+
+Letting the user search multiple fields by default
+--------------------------------------------------
+
+The default QueryParser configuration takes terms without explicit fields and
+assigns them to the default field you specified when you created the object, so
+for example if you created the object with::
+
+ parser = QueryParser("content", schema=myschema)
+
+And the user entered the query:
+
+.. code-block:: none
+
+ three blind mice
+
+The parser would treat it as:
+
+.. code-block:: none
+
+ content:three content:blind content:mice
+
+However, you might want to let the user search *multiple* fields by default. For
+example, you might want "unfielded" terms to search both the ``title`` and
+``content`` fields.
+
+In that case, you can use a :class:`whoosh.qparser.MultifieldParser`. This is
+just like the normal QueryParser, but instead of a default field name string, it
+takes a *sequence* of field names::
+
+ from whoosh.qparser import MultifieldParser
+
+ mparser = MultifieldParser(["title", "content"], schema=myschema)
+
+When this MultifieldParser instance parses ``three blind mice``, it treats it
+as:
+
+.. code-block:: none
+
+ (title:three OR content:three) (title:blind OR content:blind) (title:mice OR content:mice)
+
+
+Simplifying the query language
+------------------------------
+
+Once you have a parser::
+
+ parser = qparser.QueryParser("content", schema=myschema)
+
+you can remove features from it using the
+:meth:`~whoosh.qparser.QueryParser.remove_plugin_class` method.
+
+For example, to remove the ability of the user to specify fields to search::
+
+ parser.remove_plugin_class(qparser.FieldsPlugin)
+
+To remove the ability to search for wildcards, which can be harmful to query
+performance::
+
+ parser.remove_plugin_class(qparser.WildcardPlugin)
+
+See :doc:`/api/qparser` for information about the plugins included with
+Whoosh's query parser.
+
+
+Changing the AND, OR, ANDNOT, ANDMAYBE, and NOT syntax
+------------------------------------------------------
+
+The default parser uses English keywords for the AND, OR, ANDNOT, ANDMAYBE,
+and NOT functions::
+
+ parser = qparser.QueryParser("content", schema=myschema)
+
+You can replace the default ``OperatorsPlugin`` object to
+replace the default English tokens with your own regular expressions.
+
+The :class:`whoosh.qparser.OperatorsPlugin` implements the ability to use AND,
+OR, NOT, ANDNOT, and ANDMAYBE clauses in queries. You can instantiate a new
+``OperatorsPlugin`` and use the ``And``, ``Or``, ``Not``, ``AndNot``, and
+``AndMaybe`` keyword arguments to change the token patterns::
+
+ # Use Spanish equivalents instead of AND and OR
+ op = qparser.OperatorsPlugin(And=" Y ", Or=" O ")
+ parser.replace_plugin(op)
+
+Further, you may change the syntax of the ``NOT`` operator::
+
+ np = qparser.OperatorsPlugin(Not=' NO ')
+ parser.replace_plugin(np)
+
+The arguments can be pattern strings or precompiled regular expression objects.
+
+For example, to change the default parser to use typographic symbols instead of
+words for the AND, OR, ANDNOT, ANDMAYBE, and NOT functions::
+
+ parser = qparser.QueryParser("content", schema=myschema)
+ # These are regular expressions, so we have to escape the vertical bar
+ op = qparser.OperatorsPlugin(And="&", Or="\\|", AndNot="&!", AndMaybe="&~", Not="\\-")
+ parser.replace_plugin(op)
+
+
+Adding less-than, greater-than, etc.
+------------------------------------
+
+Normally, the way you match all terms in a field greater than "apple" is with
+an open ended range::
+
+ field:{apple to]
+
+The :class:`whoosh.qparser.GtLtPlugin` lets you specify the same search like
+this::
+
+ field:>apple
+
+The plugin lets you use ``>``, ``<``, ``>=``, ``<=``, ``=>``, or ``=<`` after
+a field specifier, and translates the expression into the equivalent range::
+
+ date:>='31 march 2001'
+
+ date:[31 march 2001 to]
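+
+If the plugin isn't already active in your parser configuration, a sketch of
+adding it::
+
+    parser = qparser.QueryParser("content", schema=myschema)
+    parser.add_plugin(qparser.GtLtPlugin())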
+
+
+Adding fuzzy term queries
+-------------------------
+
+Fuzzy queries are good for catching misspellings and similar words.
+The :class:`whoosh.qparser.FuzzyTermPlugin` lets you search for "fuzzy" terms,
+that is, terms that don't have to match exactly. The fuzzy term will match any
+similar term within a certain number of "edits" (character insertions,
+deletions, and/or transpositions -- this is called the "Damerau-Levenshtein
+edit distance").
+
+To add the fuzzy plugin::
+
+ parser = qparser.QueryParser("fieldname", my_index.schema)
+ parser.add_plugin(qparser.FuzzyTermPlugin())
+
+Once you add the fuzzy plugin to the parser, you can specify a fuzzy term by
+adding a ``~`` followed by an optional maximum edit distance. If you don't
+specify an edit distance, the default is ``1``.
+
+For example, the following "fuzzy" term query::
+
+ cat~
+
+would match ``cat`` and all terms in the index within one "edit" of cat,
+for example ``cast`` (insert ``s``), ``at`` (delete ``c``), and ``act``
+(transpose ``c`` and ``a``).
+
+If you wanted ``cat`` to match ``bat``, it requires two edits (delete ``c`` and
+insert ``b``) so you would need to set the maximum edit distance to ``2``::
+
+ cat~2
+
+Because each additional edit you allow increases the number of possibilities
+that must be checked, edit distances greater than ``2`` can be very slow.
+
+It is often useful to require that the first few characters of a fuzzy term
+match exactly. This is called a prefix. You can set the length of the prefix
+by adding a slash and a number after the edit distance. For example, to use
+a maximum edit distance of ``2`` and a prefix length of ``3``::
+
+ johannson~2/3
+
+You can specify a prefix without specifying an edit distance::
+
+ johannson~/3
+
+The default prefix distance is ``0``.
+
+
+Allowing complex phrase queries
+-------------------------------
+
+The default parser setup allows phrase (proximity) queries such as::
+
+ "whoosh search library"
+
+The default phrase query tokenizes the text between the quotes and creates a
+search for those terms in proximity.
+
+If you want to do more complex proximity searches, you can replace the phrase
+plugin with the :class:`whoosh.qparser.SequencePlugin`, which allows any query
+between the quotes. For example::
+
+ "(john OR jon OR jonathan~) peters*"
+
+The sequence syntax lets you add a "slop" factor just like the regular phrase::
+
+ "(john OR jon OR jonathan~) peters*"~2
+
+To replace the default phrase plugin with the sequence plugin::
+
+ parser = qparser.QueryParser("fieldname", my_index.schema)
+ parser.remove_plugin_class(qparser.PhrasePlugin)
+ parser.add_plugin(qparser.SequencePlugin())
+
+Alternatively, you could keep the default phrase plugin and give the sequence
+plugin different syntax by specifying a regular expression for the start/end
+marker when you create the sequence plugin. The regular expression should have
+a named group ``slop`` for the slop factor. For example::
+
+ parser = qparser.QueryParser("fieldname", my_index.schema)
+ parser.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
+
+This would allow you to use regular phrase queries and sequence queries at the
+same time::
+
+ "regular phrase" AND !sequence query~2!
+
+
+Advanced customization
+======================
+
+QueryParser arguments
+---------------------
+
+QueryParser supports two extra keyword arguments:
+
+``group``
+ The query class to use to join sub-queries when the user doesn't explicitly
+ specify a boolean operator, such as ``AND`` or ``OR``. This lets you change
+ the default operator from ``AND`` to ``OR``.
+
+ This will be the :class:`whoosh.qparser.AndGroup` or
+ :class:`whoosh.qparser.OrGroup` class (*not* an instantiated object) unless
+ you've written your own custom grouping syntax you want to use.
+
+``termclass``
+ The query class to use to wrap single terms.
+
+ This must be a :class:`whoosh.query.Query` subclass (*not* an instantiated
+ object) that accepts a fieldname string and term text unicode string in its
+ ``__init__`` method. The default is :class:`whoosh.query.Term`.
+
+ This is useful if you want to change the default term class to
+ :class:`whoosh.query.Variations`, or if you've written a custom term class
+ you want the parser to use instead of the ones shipped with Whoosh.
+
+::
+
+ >>> from whoosh.qparser import QueryParser, OrGroup
+ >>> orparser = QueryParser("content", schema=myschema, group=OrGroup)
+
+
+Configuring plugins
+-------------------
+
+The query parser's functionality is provided by a set of plugins. You can
+remove plugins to remove functionality, add plugins to add functionality, or
+replace default plugins with re-configured or rewritten versions.
+
+The :meth:`whoosh.qparser.QueryParser.add_plugin`,
+:meth:`whoosh.qparser.QueryParser.remove_plugin_class`, and
+:meth:`whoosh.qparser.QueryParser.replace_plugin` methods let you manipulate
+the plugins in a ``QueryParser`` object.
+
+See :doc:`/api/qparser` for information about the available plugins.
+
+
+.. _custom-op:
+
+Creating custom operators
+-------------------------
+
+* Decide whether you want a ``PrefixOperator``, ``PostfixOperator``, or ``InfixOperator``.
+
+* Create a new :class:`whoosh.qparser.syntax.GroupNode` subclass to hold
+ nodes affected by your operator. This object is responsible for generating
+ a :class:`whoosh.query.Query` object corresponding to the syntax.
+
+* Create a regular expression pattern for the operator's query syntax.
+
+* Create an ``OperatorsPlugin.OpTagger`` object from the above information.
+
+* Create a new ``OperatorsPlugin`` instance configured with your custom
+ operator(s).
+
+* Replace the default ``OperatorsPlugin`` in your parser with your new instance.
+
+For example, if you were creating a ``BEFORE`` operator::
+
+ from whoosh import qparser, query
+
+ optype = qparser.InfixOperator
+ pattern = " BEFORE "
+
+ class BeforeGroup(qparser.GroupNode):
+ merging = True
+ qclass = query.Ordered
+
+Create an OpTagger for your operator::
+
+    btagger = qparser.OperatorsPlugin.OpTagger(pattern, BeforeGroup,
+                                               qparser.InfixOperator)
+
+By default, infix operators are left-associative. To make a right-associative
+infix operator, do this::
+
+    btagger = qparser.OperatorsPlugin.OpTagger(pattern, BeforeGroup,
+                                               qparser.InfixOperator,
+                                               leftassoc=False)
+
+Create an :class:`~whoosh.qparser.plugins.OperatorsPlugin` instance with your
+new operator, and replace the default operators plugin in your query parser::
+
+ qp = qparser.QueryParser("text", myschema)
+ my_op_plugin = qparser.OperatorsPlugin([(btagger, 0)])
+ qp.replace_plugin(my_op_plugin)
+
+Note that the list of operators you specify with the first argument is
+*in addition to* the default operators (AND, OR, etc.). To turn off one of the
+default operators, you can pass ``None`` to the corresponding keyword argument::
+
+ cp = qparser.OperatorsPlugin([(optagger, 0)], And=None)
+
+If you want *only* your list of operators and none of the default operators,
+use the ``clean`` keyword argument::
+
+ cp = qparser.OperatorsPlugin([(optagger, 0)], clean=True)
+
+Operators earlier in the list bind more closely than operators later in the
+list.
+
+
+
+
+
diff --git a/docs/source/query.rst b/docs/source/query.rst
new file mode 100644
index 0000000..c62f555
--- /dev/null
+++ b/docs/source/query.rst
@@ -0,0 +1,10 @@
+=============
+Query objects
+=============
+
+The classes in the :mod:`whoosh.query` module implement *queries* you can run against the index.
+
+TBD.
+
+See :doc:`searching` for how to search the index using query objects.
+
diff --git a/docs/source/querylang.rst b/docs/source/querylang.rst
new file mode 100644
index 0000000..d2a214a
--- /dev/null
+++ b/docs/source/querylang.rst
@@ -0,0 +1,191 @@
+==========================
+The default query language
+==========================
+
+.. highlight:: none
+
+Overview
+========
+
+A query consists of *terms* and *operators*. There are two types of terms: single
+terms and *phrases*. Multiple terms can be combined with operators such as
+*AND* and *OR*.
+
+Whoosh supports indexing text in different *fields*. You must specify the
+*default field* when you create the :class:`whoosh.qparser.QueryParser` object.
+This is the field in which any terms the user does not explicitly specify a field
+for will be searched.
+
+Whoosh's query parser is capable of parsing different and/or additional syntax
+through the use of plug-ins. See :doc:`parsing`.
+
+
+Individual terms and phrases
+============================
+
+Find documents containing the term ``render``::
+
+ render
+
+Find documents containing the phrase ``all was well``::
+
+ "all was well"
+
+Note that a field must store Position information for phrase searching to work in
+that field.
+
+Normally when you specify a phrase, the maximum difference in position between
+each word in the phrase is 1 (that is, the words must be right next to each
+other in the document). You can allow a larger gap by adding a tilde and a
+number ("slop") after the closing quote. For example, the following matches if
+a document has ``library`` within 5 words after ``whoosh``::
+
+ "whoosh library"~5
+
+
+Boolean operators
+=================
+
+Find documents containing ``render`` *and* ``shading``::
+
+ render AND shading
+
+Note that AND is the default relation between terms, so this is the same as::
+
+ render shading
+
+Find documents containing ``render``, *and* also either ``shading`` *or*
+``modeling``::
+
+ render AND shading OR modeling
+
+Find documents containing ``render`` but *not* modeling::
+
+ render NOT modeling
+
+Find documents containing ``alpha`` but not either ``beta`` or ``gamma``::
+
+ alpha NOT (beta OR gamma)
+
+Note that when no boolean operator is specified between terms, the parser will
+insert one, by default AND. So this query::
+
+ render shading modeling
+
+is equivalent (by default) to::
+
+ render AND shading AND modeling
+
+See :doc:`customizing the default parser <parsing>` for information on how to
+change the default operator to OR.
+
+Group operators together with parentheses. For example to find documents that
+contain both ``render`` and ``shading``, or contain ``modeling``::
+
+ (render AND shading) OR modeling
+
+
+Fields
+======
+
+Find the term ``ivan`` in the ``name`` field::
+
+ name:ivan
+
+The ``field:`` prefix only sets the field for the term it directly precedes, so
+the query::
+
+ title:open sesame
+
+Will search for ``open`` in the ``title`` field and ``sesame`` in the *default*
+field.
+
+To apply a field prefix to multiple terms, group them with parentheses::
+
+ title:(open sesame)
+
+This is the same as::
+
+ title:open title:sesame
+
+Of course you can specify a field for phrases too::
+
+ title:"open sesame"
+
+
+Inexact terms
+=============
+
+Use "globs" (wildcard expressions using ``?`` to represent a single character
+and ``*`` to represent any number of characters) to match terms::
+
+ te?t test* *b?g*
+
+Note that a wildcard starting with ``?`` or ``*`` is very slow. Note also that
+these wildcards only match *individual terms*. For example, the query::
+
+ my*life
+
+will **not** match an indexed phrase like::
+
+ my so called life
+
+because those are four separate terms.
+
+
+Ranges
+======
+
+You can match a range of terms. For example, the following query will match
+documents containing terms in the lexical range from ``apple`` to ``bear``
+*inclusive*. For example, it will match documents containing ``azores`` and
+``be`` but not ``blur``::
+
+ [apple TO bear]
+
+This is very useful when you've stored, for example, dates in a lexically sorted
+format (i.e. YYYYMMDD)::
+
+ date:[20050101 TO 20090715]
+
+The range is normally *inclusive* (that is, the range will match all terms
+between the start and end term, *as well as* the start and end terms
+themselves). You can specify that one or both ends of the range are *exclusive*
+by using the ``{`` and/or ``}`` characters::
+
+ [0000 TO 0025}
+ {prefix TO suffix}
+
+You can also specify *open-ended* ranges by leaving out the start or end term::
+
+ [0025 TO]
+ {TO suffix}
+
+
+Boosting query elements
+=======================
+
+You can specify that certain parts of a query are more important for calculating
+the score of a matched document than others. For example, to specify that
+``ninja`` is twice as important as other words, and ``bear`` is half as
+important::
+
+ ninja^2 cowboy bear^0.5
+
+You can apply a boost to several terms using grouping parentheses::
+
+ (open sesame)^2.5 roc
+
+
+Making a term from literal text
+===============================
+
+If you need to include characters in a term that are normally treated specially
+by the parser, such as spaces, colons, or brackets, you can enclose the term
+in single quotes::
+
+ path:'MacHD:My Documents'
+ 'term with spaces'
+ title:'function()'
+
+
+
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
new file mode 100644
index 0000000..a0ffe51
--- /dev/null
+++ b/docs/source/quickstart.rst
@@ -0,0 +1,244 @@
+===========
+Quick start
+===========
+
+Whoosh is a library of classes and functions for indexing text and then searching the index.
+It allows you to develop custom search engines for your content. For example, if you were
+creating blogging software, you could use Whoosh to add a search function to allow users to
+search blog entries.
+
+
+A quick introduction
+====================
+
+::
+
+ >>> from whoosh.index import create_in
+ >>> from whoosh.fields import *
+ >>> schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
+ >>> ix = create_in("indexdir", schema)
+ >>> writer = ix.writer()
+ >>> writer.add_document(title=u"First document", path=u"/a",
+ ... content=u"This is the first document we've added!")
+ >>> writer.add_document(title=u"Second document", path=u"/b",
+ ... content=u"The second one is even more interesting!")
+ >>> writer.commit()
+ >>> from whoosh.qparser import QueryParser
+ >>> with ix.searcher() as searcher:
+ ... query = QueryParser("content", ix.schema).parse("first")
+ ... results = searcher.search(query)
+ ... results[0]
+ ...
+ {"title": u"First document", "path": u"/a"}
+
+
+The ``Index`` and ``Schema`` objects
+====================================
+
+To begin using Whoosh, you need an *index object*. The first time you create
+an index, you must define the index's *schema*. The schema lists the *fields*
+in the index. A field is a piece of information for each document in the index,
+such as its title or text content. A field can be *indexed* (meaning it can
+be searched) and/or *stored* (meaning the value that gets indexed is returned
+with the results; this is useful for fields such as the title).
+
+This schema has two fields, "title" and "content"::
+
+ from whoosh.fields import Schema, TEXT
+
+ schema = Schema(title=TEXT, content=TEXT)
+
+You only need to create the schema once, when you create the index. The
+schema is pickled and stored with the index.
+
+When you create the ``Schema`` object, you use keyword arguments to map field names
+to field types. The list of fields and their types defines what you are indexing
+and what's searchable. Whoosh comes with some very useful predefined field
+types, and you can easily create your own.
+
+:class:`whoosh.fields.ID`
+ This type simply indexes (and optionally stores) the entire value of the
+ field as a single unit (that is, it doesn't break it up into individual
+ words). This is useful for fields such as a file path, URL, date, category,
+ etc.
+
+:class:`whoosh.fields.STORED`
+ This field is stored with the document, but not indexed. This field type is
+ not indexed and not searchable. This is useful for document information you
+ want to display to the user in the search results.
+
+:class:`whoosh.fields.KEYWORD`
+ This type is designed for space- or comma-separated keywords. This type is
+ indexed and searchable (and optionally stored). To save space, it does not
+ support phrase searching.
+
+:class:`whoosh.fields.TEXT`
+ This type is for body text. It indexes (and optionally stores) the text and
+ stores term positions to allow phrase searching.
+
+:class:`whoosh.fields.NUMERIC`
+ This type is for numbers. You can store integers or floating point numbers.
+
+:class:`whoosh.fields.BOOLEAN`
+ This type is for boolean (true/false) values.
+
+:class:`whoosh.fields.DATETIME`
+ This type is for ``datetime`` objects. See :doc:`dates` for more
+ information.
+
+:class:`whoosh.fields.NGRAM` and :class:`whoosh.fields.NGRAMWORDS`
+ These types break the field text or individual terms into N-grams.
+ See :doc:`ngrams` for more information.
+
+(As a shortcut, if you don't need to pass any arguments to the field type, you
+can just give the class name and Whoosh will instantiate the object for you.) ::
+
+ from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT
+
+ schema = Schema(title=TEXT(stored=True), content=TEXT,
+ path=ID(stored=True), tags=KEYWORD, icon=STORED)
+
+See :doc:`schema` for more information.
+
+Once you have the schema, you can create an index using the ``create_in``
+function::
+
+ import os.path
+ from whoosh.index import create_in
+
+ if not os.path.exists("index"):
+ os.mkdir("index")
+ ix = create_in("index", schema)
+
+(At a low level, this creates a *Storage* object to contain the index. A
+``Storage`` object represents the medium in which the index will be stored.
+Usually this will be ``FileStorage``, which stores the index as a set of files
+in a directory.)
+
+After you've created an index, you can open it using the ``open_dir``
+convenience function::
+
+ from whoosh.index import open_dir
+
+ ix = open_dir("index")
+
+
+The ``IndexWriter`` object
+==========================
+
+OK, so we've got an ``Index`` object, now we can start adding documents. The
+``writer()`` method of the ``Index`` object returns an ``IndexWriter`` object that lets
+you add documents to the index. The IndexWriter's ``add_document(**kwargs)``
+method accepts keyword arguments where the field name is mapped to a value::
+
+ writer = ix.writer()
+ writer.add_document(title=u"My document", content=u"This is my document!",
+ path=u"/a", tags=u"first short", icon=u"/icons/star.png")
+ writer.add_document(title=u"Second try", content=u"This is the second example.",
+ path=u"/b", tags=u"second short", icon=u"/icons/sheep.png")
+ writer.add_document(title=u"Third time's the charm", content=u"Examples are many.",
+ path=u"/c", tags=u"short", icon=u"/icons/book.png")
+ writer.commit()
+
+Two important notes:
+
+* You don't have to fill in a value for every field. Whoosh doesn't care if you
+ leave out a field from a document.
+
+* Indexed text fields must be passed a unicode value. Fields that are stored
+ but not indexed (``STORED`` field type) can be passed any pickle-able object.
+
+If you have a text field that is both indexed and stored, you can index a
+unicode value but store a different object if necessary (it's usually not, but
+sometimes this is really useful) using this trick::
+
+ writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title")
+
+Calling commit() on the ``IndexWriter`` saves the added documents to the index::
+
+ writer.commit()
+
+See :doc:`indexing` for more information.
+
+Once your documents are committed to the index, you can search for them.
+
+
+The ``Searcher`` object
+=======================
+
+To begin searching the index, we'll need a ``Searcher`` object::
+
+ searcher = ix.searcher()
+
+You'll usually want to open the searcher using a ``with`` statement so the
+searcher is automatically closed when you're done with it (searcher objects
+represent a number of open files, so if you don't explicitly close them and the
+system is slow to collect them, you can run out of file handles)::
+
+ with ix.searcher() as searcher:
+ ...
+
+This is of course equivalent to::
+
+ try:
+ searcher = ix.searcher()
+ ...
+ finally:
+ searcher.close()
+
+The Searcher's ``search()`` method takes a *Query object*. You can construct
+query objects directly or use a query parser to parse a query string.
+
+For example, this query would match documents that contain both "apple" and
+"bear" in the "content" field::
+
+ # Construct query objects directly
+
+ from whoosh.query import *
+    myquery = And([Term("content", u"apple"), Term("content", u"bear")])
+
+To parse a query string, you can use the default query parser in the ``qparser``
+module. The first argument to the ``QueryParser`` constructor is the default
+field to search. This is usually the "body text" field. The second optional
+argument is a schema to use to understand how to parse the fields::
+
+ # Parse a query string
+
+ from whoosh.qparser import QueryParser
+ parser = QueryParser("content", ix.schema)
+ myquery = parser.parse(querystring)
+
+Once you have a ``Searcher`` and a query object, you can use the ``Searcher``'s
+``search()`` method to run the query and get a ``Results`` object::
+
+ >>> results = searcher.search(myquery)
+ >>> print(len(results))
+ 1
+ >>> print(results[0])
+ {"title": "Second try", "path": "/b", "icon": "/icons/sheep.png"}
+
+The default ``QueryParser`` implements a query language very similar to
+Lucene's. It lets you connect terms with ``AND`` or ``OR``, eliminate terms with
+``NOT``, group terms together into clauses with parentheses, do range, prefix,
+and wildcard queries, and specify different fields to search. By default it joins
+clauses together with ``AND`` (so by default, all terms you specify must be in
+the document for the document to match)::
+
+ >>> print(parser.parse(u"render shade animate"))
+ And([Term("content", "render"), Term("content", "shade"), Term("content", "animate")])
+
+ >>> print(parser.parse(u"render OR (title:shade keyword:animate)"))
+ Or([Term("content", "render"), And([Term("title", "shade"), Term("keyword", "animate")])])
+
+ >>> print(parser.parse(u"rend*"))
+ Prefix("content", "rend")
+
+Whoosh includes extra features for dealing with search results, such as
+
+* Sorting results by the value of an indexed field, instead of by relevance.
+* Highlighting the search terms in excerpts from the original documents.
+* Expanding the query terms based on the top few documents found.
+* Paginating the results (e.g. "Showing results 1-20, page 1 of 4").
+
+See :doc:`searching` for more information.
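+
+For example, a sketch of the pagination feature mentioned above, using
+``Searcher.search_page()`` (page numbers start at 1)::
+
+    with ix.searcher() as searcher:
+        page = searcher.search_page(myquery, 1, pagelen=20)
+        for hit in page:
+            print(hit["title"])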
+
diff --git a/docs/source/recipes.rst b/docs/source/recipes.rst
new file mode 100644
index 0000000..94983bf
--- /dev/null
+++ b/docs/source/recipes.rst
@@ -0,0 +1,229 @@
+==============
+Whoosh recipes
+==============
+
+General
+=======
+
+Get the stored fields for a document from the document number
+-------------------------------------------------------------
+::
+
+ stored_fields = searcher.stored_fields(docnum)
+
+
+Analysis
+========
+
+Eliminate words shorter/longer than N
+-------------------------------------
+
+Use a :class:`~whoosh.analysis.StopFilter` and the ``minsize`` and ``maxsize``
+keyword arguments. If you just want to filter based on size and not common
+words, set the ``stoplist`` to ``None``::
+
+ sf = analysis.StopFilter(stoplist=None, minsize=2, maxsize=40)
+
+
+Allow optional case-sensitive searches
+--------------------------------------
+
+A quick and easy way to do this is to index both the original and lowercased
+versions of each word. If the user searches for an all-lowercase word, it acts
+as a case-insensitive search, but if they search for a word with any uppercase
+characters, it acts as a case-sensitive search::
+
+ class CaseSensitivizer(analysis.Filter):
+ def __call__(self, tokens):
+ for t in tokens:
+ yield t
+ if t.mode == "index":
+ low = t.text.lower()
+ if low != t.text:
+ t.text = low
+ yield t
+
+ ana = analysis.RegexTokenizer() | CaseSensitivizer()
+ [t.text for t in ana("The new SuperTurbo 5000", mode="index")]
+ # ["The", "the", "new", "SuperTurbo", "superturbo", "5000"]
+
+
+Searching
+=========
+
+Find every document
+-------------------
+::
+
+ myquery = query.Every()
+
+
+iTunes-style search-as-you-type
+-------------------------------
+
+Use the :class:`whoosh.analysis.NgramWordAnalyzer` as the analyzer for the
+field you want to search as the user types. You can save space in the index by
+turning off positions in the field using ``phrase=False``, since phrase
+searching on N-gram fields usually doesn't make much sense::
+
+ # For example, to search the "title" field as the user types
+ analyzer = analysis.NgramWordAnalyzer()
+ title_field = fields.TEXT(analyzer=analyzer, phrase=False)
+ schema = fields.Schema(title=title_field)
+
+See the documentation for the :class:`~whoosh.analysis.NgramWordAnalyzer` class
+for information on the available options.
+
+
+Shortcuts
+=========
+
+Look up documents by a field value
+----------------------------------
+::
+
+ # Single document (unique field value)
+ stored_fields = searcher.document(id="bacon")
+
+ # Multiple documents
+ for stored_fields in searcher.documents(tag="cake"):
+ ...
+
+
+Sorting and scoring
+===================
+
+See :doc:`facets`.
+
+
+Score results based on the position of the matched term
+-------------------------------------------------------
+
+The following scoring function uses the position of the first occurrence of a
+term in each document to calculate the score, so documents with the given term
+earlier in the document will score higher::
+
+ from whoosh import scoring
+
+ def pos_score_fn(searcher, fieldname, text, matcher):
+ poses = matcher.value_as("positions")
+ return 1.0 / (poses[0] + 1)
+
+ pos_weighting = scoring.FunctionWeighting(pos_score_fn)
+ with myindex.searcher(weighting=pos_weighting) as s:
+ ...
+
+
+Results
+=======
+
+How many hits were there?
+-------------------------
+
+The number of *scored* hits::
+
+ found = results.scored_length()
+
+Depending on the arguments to the search, the exact total number of hits may be
+known::
+
+ if results.has_exact_length():
+ print("Scored", found, "of exactly", len(results), "documents")
+
+Usually, however, the exact number of documents that match the query is not
+known, because the searcher can skip over blocks of documents it knows won't
+show up in the "top N" list. If you call ``len(results)`` on a query where the
+exact length is unknown, Whoosh will run an unscored version of the original
+query to get the exact number. This is faster than the scored search, but may
+still be noticeably slow on very large indexes or complex queries.
+
+As an alternative, you might display the *estimated* total hits::
+
+ found = results.scored_length()
+ if results.has_exact_length():
+ print("Scored", found, "of exactly", len(results), "documents")
+ else:
+ low = results.estimated_min_length()
+ high = results.estimated_length()
+
+ print("Scored", found, "of between", low, "and", high, "documents")
+
+
+Which terms matched in each hit?
+--------------------------------
+::
+
+ # Use terms=True to record term matches for each hit
+ results = searcher.search(myquery, terms=True)
+
+ for hit in results:
+ # Which terms matched in this hit?
+ print("Matched:", hit.matched_terms())
+
+ # Which terms from the query didn't match in this hit?
+ print("Didn't match:", myquery.all_terms() - hit.matched_terms())
+
+
+Global information
+==================
+
+How many documents are in the index?
+------------------------------------
+::
+
+ # Including documents that are deleted but not yet optimized away
+ numdocs = searcher.doc_count_all()
+
+ # Not including deleted documents
+ numdocs = searcher.doc_count()
+
+
+What fields are in the index?
+-----------------------------
+::
+
+ return myindex.schema.names()
+
+
+Is term X in the index?
+-----------------------
+::
+
+ return ("content", "wobble") in searcher
+
+
+How many times does term X occur in the index?
+----------------------------------------------
+::
+
+ # Number of times content:wobble appears in all documents
+ freq = searcher.frequency("content", "wobble")
+
+ # Number of documents containing content:wobble
+ docfreq = searcher.doc_frequency("content", "wobble")
+
+
+Is term X in document Y?
+------------------------
+::
+
+ # Check if the "content" field of document 500 contains the term "wobble"
+
+ # Without term vectors, skipping through list...
+ postings = searcher.postings("content", "wobble")
+ postings.skip_to(500)
+ return postings.id() == 500
+
+ # ...or the slower but easier way
+ docset = set(searcher.postings("content", "wobble").all_ids())
+ return 500 in docset
+
+ # If field has term vectors, skipping through list...
+ vector = searcher.vector(500, "content")
+ vector.skip_to("wobble")
+ return vector.id() == "wobble"
+
+ # ...or the slower but easier way
+ wordset = set(searcher.vector(500, "content").all_ids())
+ return "wobble" in wordset
+
diff --git a/docs/source/releases/0_3.rst b/docs/source/releases/0_3.rst
new file mode 100644
index 0000000..780d82e
--- /dev/null
+++ b/docs/source/releases/0_3.rst
@@ -0,0 +1,61 @@
+========================
+Whoosh 0.3 release notes
+========================
+
+* Major improvements to reading/writing of postings and query performance.
+
+* Changed default post limit (run size) from 4 MB to 32 MB.
+
+* Finished migrating backend-specific code into ``whoosh.filedb`` package.
+
+* Moved formats from whoosh.fields module into new whoosh.formats module.
+
+* DocReader and TermReader classes combined into new IndexReader interface.
+ You can get an IndexReader implementation by calling Index.reader().
+ Searcher is now a wrapper around an IndexReader.
+
+* Range query object changed, with new signature and new syntax in the default
+ query parser. Now you can use ``[start TO end]`` in the query parser for an
+ inclusive range, and ``{start TO end}`` for an exclusive range. You can also
+ mix the delimiters, for example ``[start TO end}`` for a range with an
+ inclusive start but exclusive end term.
+
+* Added experimental DATETIME field type lets you pass a
+ ``datetime.datetime`` object as a field value to ``add_document``::
+
+ from whoosh.fields import Schema, ID, DATETIME
+ from whoosh.filedb.filestore import RamStorage
+ from datetime import datetime
+
+ schema = Schema(id=ID, date=DATETIME)
+ storage = RamStorage()
+ ix = storage.create_index(schema)
+ w = ix.writer()
+ w.add_document(id=u"A", date=datetime.now())
+ w.close()
+
+ Internally, the DATETIME field indexes the datetime object as text using
+ the format (4 digit year + 2 digit month + 2 digit day + 'T' + 2 digit hour +
+ 2 digit minute + 2 digit second + 6 digit microsecond), for example
+ ``20090817T160203109000``.
+
+* The default query parser now lets you use quoted strings in prefix and range
+ queries, e.g. ``["2009-05" TO "2009-12"]``, ``"alfa/bravo"*``, making it
+ easier to work with terms containing special characters.
+
+* ``DocReader.vector_as(docnum, fieldid, astype)`` is now
+ ``IndexReader.vector_as(astype, docnum, fieldid)`` (i.e. the astype argument
+ has moved from the last to the first argument), e.g.
+ ``v = ixreader.vector_as("frequency", 102, "content")``.
+
+* Added whoosh.support.charset for translating Sphinx charset table files.
+
+* Added whoosh.analysis.CharsetTokenizer and CharsetFilter to enable case and
+ accent folding.
+
+* Added experimental ``whoosh.ramdb`` in-memory backend.
+
+* Added experimental ``whoosh.query.FuzzyTerm`` query type.
+
+* Added ``whoosh.lang.wordnet`` module containing ``Thesaurus`` object for using
+ WordNet synonym database.
diff --git a/docs/source/releases/1_0.rst b/docs/source/releases/1_0.rst
new file mode 100644
index 0000000..524d1fc
--- /dev/null
+++ b/docs/source/releases/1_0.rst
@@ -0,0 +1,482 @@
+========================
+Whoosh 1.x release notes
+========================
+
+Whoosh 1.8.3
+============
+
+Whoosh 1.8.3 contains important bugfixes and new functionality. Thanks to all
+the mailing list and BitBucket users who helped with the fixes!
+
+Fixed a bad ``Collector`` bug where the docset of a Results object did not match
+the actual results.
+
+You can now pass a sequence of objects to a keyword argument in ``add_document``
+and ``update_document`` (currently this will not work for unique fields in
+``update_document``). This is useful for non-text fields such as ``DATETIME``
+and ``NUMERIC``, allowing you to index multiple dates/numbers for a document::
+
+ writer.add_document(shoe=u"Saucony Kinvara", sizes=[10.0, 9.5, 12])
+
+This version reverts to using the CDB hash function for hash files instead of
+Python's ``hash()`` because the latter is not meant to be stored externally.
+This change maintains backwards compatibility with old files.
+
+The ``Searcher.search`` method now takes a ``mask`` keyword argument. This is
+the opposite of the ``filter`` argument. Where the ``filter`` specifies the
+set of documents that can appear in the results, the ``mask`` specifies a
+set of documents that must not appear in the results.
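+
+A minimal sketch (the ``status`` field and the queries here are assumed for
+illustration only)::
+
+    from whoosh import query
+
+    hidden = query.Term("status", "draft")
+    results = searcher.search(userquery, mask=hidden)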
+
+Fixed performance problems in ``Searcher.more_like``. This method now also
+takes a ``filter`` keyword argument like ``Searcher.search``.
+
+Improved documentation.
+
+
+Whoosh 1.8.2
+============
+
+Whoosh 1.8.2 fixes some bugs, including a mistyped signature in
+Searcher.more_like and a bad bug in Collector that could screw up the
+ordering of results given certain parameters.
+
+
+Whoosh 1.8.1
+============
+
+Whoosh 1.8.1 includes a few recent bugfixes/improvements:
+
+- ListMatcher.skip_to_quality() wasn't returning an integer, resulting
+ in a "None + int" error.
+
+- Fixed locking and memcache sync bugs in the Google App Engine storage
+ object.
+
+- MultifieldPlugin wasn't working correctly with groups.
+
+- The binary matcher trees of Or and And are now generated using a
+  Huffman-like algorithm instead of being perfectly balanced. This gives a
+  noticeable speed improvement because less information has to be passed
+  up/down the tree.
+
+
+Whoosh 1.8
+==========
+
+This release relicensed the Whoosh source code under the Simplified BSD (A.K.A.
+"two-clause" or "FreeBSD") license. See LICENSE.txt for more information.
+
+
+Whoosh 1.7.7
+============
+
+Setting a TEXT field to store term vectors is now much easier. Instead of
+having to pass an instantiated whoosh.formats.Format object to the vector=
+keyword argument, you can pass True to automatically use the same format and
+analyzer as the inverted index. Alternatively, you can pass a Format subclass
+and Whoosh will instantiate it for you.
+
+For example, to store term vectors using the same settings as the inverted
+index (Positions format and StandardAnalyzer)::
+
+ from whoosh.fields import Schema, TEXT
+
+ schema = Schema(content=TEXT(vector=True))
+
+To store term vectors that use the same analyzer as the inverted index
+(StandardAnalyzer by default) but only store term frequency::
+
+ from whoosh.formats import Frequency
+
+ schema = Schema(content=TEXT(vector=Frequency))
+
+Note that currently the only place term vectors are used in Whoosh is keyword
+extraction/more like this, but they can be useful for expert users with custom
+code.
+
+Added :meth:`whoosh.searching.Searcher.more_like` and
+:meth:`whoosh.searching.Hit.more_like_this` methods, as shortcuts for doing
+keyword extraction yourself. Both return a ``Results`` object.
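+
+A short sketch, assuming a searcher and a ``"content"`` field::
+
+    results = searcher.search(myquery)
+    if results:
+        # Documents similar to the best hit, judged by its "content" field
+        similar = results[0].more_like_this("content")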
+
+"python setup.py test" works again, as long as you have nose installed.
+
+The :meth:`whoosh.searching.Searcher.sort_query_using` method lets you sort
+documents matching a given query using an arbitrary function. Note that, like
+"complex" searching with the ``Sorter`` object, this can be slow on large
+multi-segment indexes.
+
+
+Whoosh 1.7
+==========
+
+You can once again perform complex sorting of search results (that is, a sort
+with some fields ascending and some fields descending).
+
+You can still use the ``sortedby`` keyword argument to
+:meth:`whoosh.searching.Searcher.search` to do a simple sort (where all fields
+are sorted in the same direction), or you can use the new
+:class:`~whoosh.sorting.Sorter` class to do a simple or complex sort::
+
+ searcher = myindex.searcher()
+ sorter = searcher.sorter()
+ # Sort first by the group field, ascending
+ sorter.add_field("group")
+ # Then by the price field, descending
+ sorter.add_field("price", reverse=True)
+ # Get the Results
+ results = sorter.sort_query(myquery)
+
+See the documentation for the :class:`~whoosh.sorting.Sorter` class for more
+information. Bear in mind that complex sorts will be much slower on large
+indexes because they can't use the per-segment field caches.
+
+You can now get highlighted snippets for a hit automatically using
+:meth:`whoosh.searching.Hit.highlights`::
+
+ results = searcher.search(myquery, limit=20)
+ for hit in results:
+ print hit["title"]
+ print hit.highlights("content")
+
+See :meth:`whoosh.searching.Hit.highlights` for more information.
+
+Added the ability to filter search results so that only hits that are in a
+``Results`` set, in a set of docnums, or that match a query are returned. The
+filter is cached on the searcher::
+
+ # Search within previous results
+ newresults = searcher.search(newquery, filter=oldresults)
+
+ # Search within the "basics" chapter
+ results = searcher.search(userquery, filter=query.Term("chapter", "basics"))
+
+You can now specify a time limit for a search. If the search does not finish
+in the given time, a :class:`whoosh.searching.TimeLimit` exception is raised,
+but you can still retrieve the partial results from the collector. See the
+``timelimit`` and ``greedy`` arguments in the
+:class:`whoosh.searching.Collector` documentation.
+
+Added back the ability to set :class:`whoosh.analysis.StemFilter` to use an
+unlimited cache. This is useful for one-shot batch indexing (see
+:doc:`../batch`).
+
+The ``normalize()`` method of the ``And`` and ``Or`` queries now merges
+overlapping range queries for more efficient queries.
+
+Query objects now have ``__hash__`` methods allowing them to be used as
+dictionary keys.
+
+The API of the highlight module has changed slightly. Most of the functions
+in the module have been converted to classes. However, most old code should
+still work. The ``NullFragmenter`` is now called ``WholeFragmenter``, but the
+old name is still available as an alias.
+
+Fixed MultiPool so it won't fill up the temp directory with job files.
+
+Fixed a bug where Phrase query objects did not use their boost factor.
+
+Fixed a bug where a fieldname after an open parenthesis wasn't parsed
+correctly. The change alters the semantics of certain parsing "corner cases"
+(such as ``a:b:c:d``).
+
+
+Whoosh 1.6
+==========
+
+The ``whoosh.writing.BatchWriter`` class is now called
+:class:`whoosh.writing.BufferedWriter`. It is similar to the old ``BatchWriter``
+class but allows you to search and update the buffered documents as well as the
+documents that have been flushed to disk::
+
+ writer = writing.BufferedWriter(myindex)
+
+ # You can update (replace) documents in RAM without having to commit them
+ # to disk
+ writer.add_document(path="/a", text="Hi there")
+ writer.update_document(path="/a", text="Hello there")
+
+    # Search committed and uncommitted documents by getting a searcher from the
+ # writer instead of the index
+ searcher = writer.searcher()
+
+(BatchWriter is still available as an alias for backwards compatibility.)
+
+The :class:`whoosh.qparser.QueryParser` initialization method now requires a
+schema as the second argument. Previously the default was to create a
+``QueryParser`` without a schema, which was confusing::
+
+ qp = qparser.QueryParser("content", myindex.schema)
+
+The :meth:`whoosh.searching.Searcher.search` method now takes a ``scored``
+keyword. If you search with ``scored=False``, the results will be in "natural"
+order (the order the documents were added to the index). This is useful when
+you don't need scored results but want the convenience of the Results object.
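+
+For example, a minimal call using this keyword::
+
+    # Hits come back in index order rather than by score
+    results = searcher.search(myquery, scored=False)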
+
+Added the :class:`whoosh.qparser.GtLtPlugin` parser plugin to allow greater
+than/less-than as an alternative syntax for ranges::
+
+ count:>100 tag:<=zebra date:>='29 march 2001'
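+
+A sketch of registering the plugin on a parser (assuming an existing index
+``myindex``; ``add_plugin`` is the standard way to extend the parser)::
+
+    from whoosh import qparser
+
+    parser = qparser.QueryParser("content", myindex.schema)
+    parser.add_plugin(qparser.GtLtPlugin())
+    q = parser.parse(u"count:>100")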
+
+Added the ability to define schemas declaratively, similar to Django models::
+
+ from whoosh import index
+ from whoosh.fields import SchemaClass, ID, KEYWORD, STORED, TEXT
+
+ class MySchema(SchemaClass):
+ uuid = ID(stored=True, unique=True)
+ path = STORED
+ tags = KEYWORD(stored=True)
+ content = TEXT
+
+ index.create_in("indexdir", MySchema)
+
+Whoosh 1.6.2: Added :class:`whoosh.searching.TermTrackingCollector` which tracks
+which part of the query matched which documents in the final results.
+
+Replaced the unbounded cache in :class:`whoosh.analysis.StemFilter` with a
+bounded LRU (least recently used) cache. This will make stemming analysis
+slightly slower but prevent it from eating up too much memory over time.
+
+Added a simple :class:`whoosh.analysis.PyStemmerFilter` that works when the
+py-stemmer library is installed::
+
+ ana = RegexTokenizer() | PyStemmerFilter("spanish")
+
+The estimation of memory usage for the ``limitmb`` keyword argument to
+``FileIndex.writer()`` is more accurate, which should help keep memory usage
+by the sorting pool closer to the limit.
+
+The ``whoosh.ramdb`` package was removed and replaced with a single
+``whoosh.ramindex`` module.
+
+Miscellaneous bug fixes.
+
+
+Whoosh 1.5
+==========
+
+.. note::
+ Whoosh 1.5 is incompatible with previous indexes. You must recreate
+ existing indexes with Whoosh 1.5.
+
+Fixed a bug where postings were not portable across different endian platforms.
+
+New generalized field cache system, using per-reader caches, for much faster
+sorting and faceting of search results, as well as much faster multi-term (e.g.
+prefix and wildcard) and range queries, especially for large indexes and/or
+indexes with multiple segments.
+
+Changed the faceting API. See :doc:`../facets`.
+
+Faster storage and retrieval of posting values.
+
+Added per-field ``multitoken_query`` attribute to control how the query parser
+deals with a "term" that when analyzed generates multiple tokens. The default
+value is ``"first"``, which throws away all but the first token (the previous
+behavior). Other possible values are ``"and"``, ``"or"``, or ``"phrase"``.
+
+Added :class:`whoosh.analysis.DoubleMetaphoneFilter`,
+:class:`whoosh.analysis.SubstitutionFilter`, and
+:class:`whoosh.analysis.ShingleFilter`.
+
+Added :class:`whoosh.qparser.CopyFieldPlugin`.
+
+Added :class:`whoosh.query.Otherwise`.
+
+Generalized parsing of operators (such as OR, AND, NOT, etc.) in the query
+parser to make it easier to add new operators. I intend to add a better API
+for this in a future release.
+
+Switched NUMERIC and DATETIME fields to use more compact on-disk
+representations of numbers.
+
+Fixed a bug in the porter2 stemmer when stemming the string `"y"`.
+
+Added methods to :class:`whoosh.searching.Hit` to make it more like a `dict`.
+
+Short posting lists (by default, single postings) are now stored inline in the
+term file instead of being written to the posting file, for faster retrieval
+and a small saving in disk space.
+
+
+Whoosh 1.3
+==========
+
+Whoosh 1.3 adds a more efficient DATETIME field based on the new tiered NUMERIC
+field, and the DateParserPlugin. See :doc:`../dates`.
+
+
+Whoosh 1.2
+==========
+
+Whoosh 1.2 adds tiered indexing for NUMERIC fields, resulting in much faster
+range queries on numeric fields.
+
+
+Whoosh 1.0
+==========
+
+Whoosh 1.0 is a major milestone release with vastly improved performance and
+several useful new features.
+
+*The index format of this version is not compatible with indexes created by
+previous versions of Whoosh*. You will need to reindex your data to use this
+version.
+
+Orders of magnitude faster searches for common terms. Whoosh now uses
+optimizations similar to those in Xapian to skip reading low-scoring postings.
+
+Faster indexing and ability to use multiple processors (via ``multiprocessing``
+module) to speed up indexing.
+
+Flexible Schema: you can now add and remove fields in an index with the
+:meth:`whoosh.writing.IndexWriter.add_field` and
+:meth:`whoosh.writing.IndexWriter.remove_field` methods.
+
+New hand-written query parser based on plug-ins. Less brittle, more robust,
+more flexible, and easier to fix/improve than the old pyparsing-based parser.
+
+On-disk formats now use 64-bit disk pointers allowing files larger than 4 GB.
+
+New :class:`whoosh.searching.Facets` class efficiently sorts results into
+facets based on any criteria that can be expressed as queries, for example
+tags or price ranges.
+
+New :class:`whoosh.writing.BatchWriter` class automatically batches up
+individual ``add_document`` and/or ``delete_document`` calls until a certain
+number of calls or a certain amount of time passes, then commits them all at
+once.
+
+New :class:`whoosh.analysis.BiWordFilter` lets you create bi-word indexed
+fields, a possible alternative to phrase searching.
+
+Fixed bug where files could be deleted before a reader could open them in
+threaded situations.
+
+New :class:`whoosh.analysis.NgramFilter` filter,
+:class:`whoosh.analysis.NgramWordAnalyzer` analyzer, and
+:class:`whoosh.fields.NGRAMWORDS` field type allow producing n-grams from
+tokenized text.
+
+Errors in query parsing now raise a specific ``whoosh.qparser.QueryParserError``
+exception instead of a generic exception.
+
+Previously, the query string ``*`` was optimized to a
+:class:`whoosh.query.Every` query which matched every document. Now the
+``Every`` query only matches documents that actually have an indexed term from
+the given field, to better match the intuitive sense of what a query string like
+``tag:*`` should do.
+
+New :meth:`whoosh.searching.Searcher.key_terms_from_text` method lets you
+extract key words from arbitrary text instead of documents in the index.
+
+Previously the :meth:`whoosh.searching.Searcher.key_terms` and
+:meth:`whoosh.searching.Results.key_terms` methods required that the given
+field store term vectors. They now also work if the given field is stored
+instead. They will analyze the stored string into a term vector on-the-fly.
+The field must still be indexed.
+
+
+User API changes
+================
+
+The default for the ``limit`` keyword argument to
+:meth:`whoosh.searching.Searcher.search` is now ``10``. To return all results
+in a single ``Results`` object, use ``limit=None``.
+
+The ``Index`` object no longer represents a snapshot of the index at the time
+the object was instantiated. Instead it always represents the index in the
+abstract. ``Searcher`` and ``IndexReader`` objects obtained from the
+``Index`` object still represent the index as it was at the time they were
+created.
+
+Because the ``Index`` object no longer represents the index at a specific
+version, several methods such as ``up_to_date`` and ``refresh`` were removed
+from its interface. The Searcher object now has
+:meth:`~whoosh.searching.Searcher.last_modified`,
+:meth:`~whoosh.searching.Searcher.up_to_date`, and
+:meth:`~whoosh.searching.Searcher.refresh` methods similar to those that used to
+be on ``Index``.
+
+The document deletion and field add/remove methods on the ``Index`` object now
+create a writer behind the scenes to accomplish each call. This means they write
+to the index immediately, so you don't need to call ``commit`` on the ``Index``.
+Also, if you need to call them multiple times, it will be much faster to
+create your own writer instead::
+
+ # Don't do this
+ for id in my_list_of_ids_to_delete:
+ myindex.delete_by_term("id", id)
+ myindex.commit()
+
+ # Instead do this
+ writer = myindex.writer()
+ for id in my_list_of_ids_to_delete:
+ writer.delete_by_term("id", id)
+ writer.commit()
+
+The ``postlimit`` argument to ``Index.writer()`` has been changed to
+``postlimitmb`` and is now expressed in megabytes instead of bytes::
+
+ writer = myindex.writer(postlimitmb=128)
+
+Instead of having to import ``whoosh.filedb.filewriting.NO_MERGE`` or
+``whoosh.filedb.filewriting.OPTIMIZE`` to use as arguments to ``commit()``, you
+can now simply do the following::
+
+ # Do not merge segments
+ writer.commit(merge=False)
+
+ # or
+
+ # Merge all segments
+ writer.commit(optimize=True)
+
+The ``whoosh.postings`` module is gone. The ``whoosh.matching`` module contains
+classes for posting list readers.
+
+Whoosh no longer maps field names to numbers for internal use or writing to
+disk. Any low-level method that accepted field numbers now accept field names
+instead.
+
+Custom Weighting implementations that use the ``final()`` method must now
+set the ``use_final`` attribute to ``True``::
+
+ from whoosh.scoring import BM25F
+
+ class MyWeighting(BM25F):
+ use_final = True
+
+        def final(self, searcher, docnum, score):
+ return score + docnum * 10
+
+This disables the new optimizations, forcing Whoosh to score every matching
+document.
+
+:class:`whoosh.writing.AsyncWriter` now takes an :class:`whoosh.index.Index`
+object as its first argument, not a callable. Also, the keyword arguments to
+pass to the index's ``writer()`` method should now be passed as a dictionary
+using the ``writerargs`` keyword argument.
+
+Whoosh now stores per-document field length using an approximation rather than
+exactly. For low numbers the approximation is perfectly accurate, while high
+numbers will be approximated less accurately.
+
+The ``doc_field_length`` method on searchers and readers now takes a second
+argument representing the default to return if the given document and field
+do not have a length (i.e. the field is not scored or the field was not
+provided for the given document).
+
+The :class:`whoosh.analysis.StopFilter` now has a ``maxsize`` argument as well
+as a ``minsize`` argument to its initializer. Analyzers that use the
+``StopFilter`` now also accept the ``maxsize`` argument in their initializers.
+
+The interface of :class:`whoosh.writing.AsyncWriter` has changed.
+
+
+Misc
+====
+
+* Because the file backend now writes 64-bit disk pointers and field names
+ instead of numbers, the size of an index on disk will grow compared to
+ previous versions.
+
+* Unit tests should no longer leave directories and files behind.
+
diff --git a/docs/source/releases/2_0.rst b/docs/source/releases/2_0.rst
new file mode 100644
index 0000000..3978460
--- /dev/null
+++ b/docs/source/releases/2_0.rst
@@ -0,0 +1,333 @@
+========================
+Whoosh 2.x release notes
+========================
+
+Whoosh 2.7
+==========
+
+* Removed on-disk word graph implementation of spell checking in favor of much
+ simpler and faster FSA implementation over the term file.
+
+* Many bug fixes.
+
+* Removed backwards compatibility with indexes created by versions prior to
+ 2.5. You may need to re-index if you are using an old index that hasn't been
+ updated.
+
+* This is the last 2.x release before a major overhaul that will break backwards
+ compatibility.
+
+
+Whoosh 2.5
+==========
+
+* Whoosh 2.5 will read existing indexes, but segments created by 2.5 will not
+ be readable by older versions of Whoosh.
+
+* As a replacement for field caches to speed up sorting, Whoosh now supports
+ adding a ``sortable=True`` keyword argument to fields. This makes Whoosh store
+ a sortable representation of the field's values in a "column" format
+ (which associates a "key" value with each document). This is more robust,
+ efficient, and customizable than the old behavior.
+ You should now specify ``sortable=True`` on fields that you plan on using to
+ sort or group search results.
+
+ (You can still sort/group on fields that don't have ``sortable=True``,
+ however it will use more RAM and be slower as Whoosh caches the field values
+ in memory.)
+
+ Fields that use ``sortable=True`` can avoid specifying ``stored=True``. The
+ field's value will still be available on ``Hit`` objects (the value will be
+ retrieved from the column instead of from the stored fields). This may
+ actually be faster for certain types of values.
+
+* Whoosh will now detect common types of OR queries and use optimized read-ahead
+ matchers to speed them up by several times.
+
+* Whoosh now includes pure-Python implementations of the Snowball stemmers and
+ stop word lists for various languages adapted from NLTK. These are available
+ through the :class:`whoosh.analysis.LanguageAnalyzer` analyzer or through the
+ ``lang=`` keyword argument to the
+ :class:`~whoosh.fields.TEXT` field.
+
+* You can now use the
+ :meth:`whoosh.filedb.filestore.Storage.create()` and
+  :meth:`whoosh.filedb.filestore.Storage.destroy()`
+ methods as a consistent API to set up and tear down different types of
+ storage.
+
+* Many bug fixes and speed improvements.
+
+* Switched unit tests to use ``py.test`` instead of ``nose``.
+
+* Removed obsolete ``SpellChecker`` class.
+
+
+Whoosh 2.4
+==========
+
+* By default, Whoosh now assembles the individual files of a segment into a
+ single file when committing. This has a small performance penalty but solves
+ a problem where Whoosh can keep too many files open. Whoosh is also now
+ smarter about using mmap.
+
+* Added functionality to index and search hierarchical documents. See
+ :doc:`/nested`.
+
+* Rewrote the Directed Acyclic Word Graph implementation (used in spell
+ checking) to be faster and more space-efficient. Word graph files created by
+ previous versions will be ignored, meaning that spell checking may become
+ slower unless/until you replace the old segments (for example, by
+ optimizing).
+
+* Rewrote multiprocessing indexing to be faster and simpler. You can now
+ do ``myindex.writer(procs=n)`` to get a multiprocessing writer, or
+ ``myindex.writer(procs=n, multisegment=True)`` to get a multiprocessing
+ writer that leaves behind multiple segments, like the old MultiSegmentWriter.
+ (``MultiSegmentWriter`` is still available as a function that returns the
+ new class.)
+
+* When creating ``Term`` query objects for special fields (e.g. NUMERIC or
+ BOOLEAN), you can now use the field's literal type instead of a string as the
+ second argument, for example ``Term("num", 20)`` or ``Term("bool", True)``.
+ (This change may cause problems interacting with functions that expect
+ query objects to be pure textual, such as spell checking.)
+
+* All writing to and reading from on-disk indexes is now done through "codec"
+ objects. This architecture should make it easier to add optional or
+ experimental features, and maintain backwards compatibility.
+
+* Fixes issues #75, #137, #206, #213, #215, #219, #223, #226, #230, #233, #238,
+ #239, #240, #241, #243, #244, #245, #252, #253, and other bugs. Thanks to
+ Thomas Waldmann and Alexei Gousev for the help!
+
+
+Whoosh 2.3.2
+============
+
+* Fixes bug in BM25F scoring function, leading to increased precision in search
+ results.
+
+* Fixes issues #203, #205, #206, #208, #209, #212.
+
+
+Whoosh 2.3.1
+============
+
+* Fixes issue #200.
+
+
+Whoosh 2.3
+==========
+
+* Added a :class:`whoosh.query.Regex` term query type, similar to
+ :class:`whoosh.query.Wildcard`. The parser does not allow regex term queries
+ by default. You need to add the :class:`whoosh.qparser.RegexPlugin` plugin.
+ After you add the plugin, you can use ``r"expression"`` query syntax for
+ regular expression term queries. For example, ``r"foo.*bar"``.
+
+* Added the :class:`whoosh.qparser.PseudoFieldPlugin` parser plugin. This
+ plugin lets you create "pseudo-fields" that run a transform function on
+ whatever query syntax the user applies the field to. This is fairly advanced
+ functionality right now; I'm trying to think of ways to make its power easier
+ to access.
+
+* The documents in the lists in the dictionary returned by ``Results.groups()``
+ by default are now in the same relative order as in the results. This makes
+ it much easier to display the "top N" results in each category, for example.
+
+* The ``groupids`` keyword argument to ``Searcher.search`` has been removed.
+ Instead you can now pass a :class:`whoosh.sorting.FacetMap` object to the
+ ``Searcher.search`` method's ``maptype`` argument to control how faceted
+ documents are grouped, and/or set the ``maptype`` argument on individual
+  :class:`whoosh.sorting.FacetType` objects to set custom grouping per facet.
+ See :doc:`../facets` for more information.
+
+* Calling ``Searcher.documents()`` or ``Searcher.document_numbers()`` with no
+ arguments now yields all documents/numbers.
+
+* Calling ``Writer.update_document()`` with no unique fields is now equivalent
+ to calling ``Writer.add_document()`` with the same arguments.
+
+* Fixed a problem with keyword expansion where the code was building a cache
+ that was fast on small indexes, but unacceptably slow on large indexes.
+
+* Added the hyphen (``-``) to the list of characters that match a "wildcard"
+ token, to make parsing slightly more predictable. A true fix will have to
+ wait for another parser rewrite.
+
+* Fixed an unused ``__future__`` import and use of ``float("nan")`` which were
+ breaking under Python 2.5.
+
+* Fixed a bug where vectored fields with only one term stored an empty term
+ vector.
+
+* Various other bug fixes.
+
+Whoosh 2.2
+==========
+
+* Fixes several bugs, including a bad bug in BM25F scoring.
+
+* Added ``allow_overlap`` option to :class:`whoosh.sorting.StoredFieldFacet`.
+
+* In :meth:`~whoosh.writing.IndexWriter.add_document`, you can now pass
+  query-like strings for BOOLEAN and DATETIME fields (e.g. ``boolfield="true"``
+ and ``dtfield="20101131-16:01"``) as an alternative to actual ``bool`` or
+ ``datetime`` objects. The implementation of this is incomplete: it only works
+ in the default ``filedb`` backend, and if the field is stored, the stored
+ value will be the string, not the parsed object.
+
+* Added :class:`whoosh.analysis.CompoundWordFilter` and
+ :class:`whoosh.analysis.TeeFilter`.
+
+
+Whoosh 2.1
+==========
+
+This release fixes several bugs and contains speed improvements to highlighting.
+See :doc:`/highlight` for more information.
+
+
+Whoosh 2.0
+==========
+
+Improvements
+------------
+
+* Whoosh is now compatible with Python 3 (tested with Python 3.2). Special
+ thanks to Vinay Sajip who did the work, and also Jordan Sherer who helped
+ fix later issues.
+
+* Sorting and grouping (faceting) now use a new system of "facet" objects which
+ are much more flexible than the previous field-based system.
+
+ For example, to sort by first name and then score::
+
+ from whoosh import sorting
+
+ mf = sorting.MultiFacet([sorting.FieldFacet("firstname"),
+ sorting.ScoreFacet()])
+ results = searcher.search(myquery, sortedby=mf)
+
+ In addition to the previously supported sorting/grouping by field contents
+ and/or query results, you can now use numeric ranges, date ranges, score, and
+ more. The new faceting system also supports overlapping groups.
+
+ (The old "Sorter" API still works but is deprecated and may be removed in a
+ future version.)
+
+ See :doc:`/facets` for more information.
+
+* Completely revamped spell-checking to make it much faster, easier, and more
+  flexible. You can enable generation of the graph files used by spell checking
+ using the ``spelling=True`` argument to a field type::
+
+ schema = fields.Schema(text=fields.TEXT(spelling=True))
+
+ (Spelling suggestion methods will work on fields without ``spelling=True``
+  but will be slower.) The spelling graph will be updated automatically as new
+ documents are added -- it is no longer necessary to maintain a separate
+ "spelling index".
+
+ You can get suggestions for individual words using
+ :meth:`whoosh.searching.Searcher.suggest`::
+
+ suglist = searcher.suggest("content", "werd", limit=3)
+
+ Whoosh now includes convenience methods to spell-check and correct user
+ queries, with optional highlighting of corrections using the
+ ``whoosh.highlight`` module::
+
+ from whoosh import highlight, qparser
+
+ # User query string
+ qstring = request.get("q")
+
+ # Parse into query object
+ parser = qparser.QueryParser("content", myindex.schema)
+ qobject = parser.parse(qstring)
+
+ results = searcher.search(qobject)
+
+ if not results:
+          correction = searcher.correct_query(qobject, qstring)
+ # correction.query = corrected query object
+ # correction.string = corrected query string
+
+ # Format the corrected query string with HTML highlighting
+ cstring = correction.format_string(highlight.HtmlFormatter())
+
+ Spelling suggestions can come from field contents and/or lists of words.
+ For stemmed fields the spelling suggestions automatically use the unstemmed
+ forms of the words.
+
+ There are APIs for spelling suggestions and query correction, so highly
+ motivated users could conceivably replace the defaults with more
+ sophisticated behaviors (for example, to take context into account).
+
+ See :doc:`/spelling` for more information.
+
+* :class:`whoosh.query.FuzzyTerm` now uses the new word graph feature as well
+ and so is much faster.
+
+* You can now set a boost factor for individual documents as you index them,
+ to increase the score of terms in those documents in searches. See the
+ documentation for the :meth:`~whoosh.writing.IndexWriter.add_document` for
+ more information.
+
+* Added built-in recording of which terms matched in which documents. Use the
+ ``terms=True`` argument to :meth:`whoosh.searching.Searcher.search` and use
+ :meth:`whoosh.searching.Hit.matched_terms` and
+ :meth:`whoosh.searching.Hit.contains_term` to check matched terms.
+
+* Whoosh now supports whole-term quality optimizations, so for example if the
+ system knows that a UnionMatcher cannot possibly contribute to the "top N"
+ results unless both sub-matchers match, it will replace the UnionMatcher with
+ an IntersectionMatcher which is faster to compute. The performance improvement
+ is not as dramatic as from block quality optimizations, but it can be
+ noticeable.
+
+* Fixed a bug that prevented block quality optimizations in queries with words
+ not in the index, which could severely degrade performance.
+
+* Block quality optimizations now use the actual scoring algorithm to calculate
+ block quality instead of an approximation, which fixes issues where ordering
+ of results could be different for searches with and without the optimizations.
+
+* The BOOLEAN field type now supports field boosts.
+
+* Re-architected the query parser to make the code easier to understand. Custom
+ parser plugins from previous versions will probably break in Whoosh 2.0.
+
+* Various bug-fixes and performance improvements.
+
+* Removed the "read lock", which caused more problems than it solved. Now when
+ opening a reader, if segments are deleted out from under the reader as it
+ is opened, the code simply retries.
+
+
+Compatibility
+-------------
+
+* The term quality optimizations required changes to the on-disk formats.
+  Whoosh 2.0 is backwards-compatible with the old format. As you rewrite an
+ index using Whoosh 2.0, by default it will use the new formats for new
+ segments, making the index incompatible with older versions.
+
+ To upgrade an existing index to use the new formats immediately, use
+ ``Index.optimize()``.
+
+* Removed the experimental ``TermTrackingCollector`` since it is replaced by
+ the new built-in term recording functionality.
+
+* Removed the experimental ``Searcher.define_facets`` feature until a future
+ release when it will be replaced by a more robust and useful feature.
+
+* Reader iteration methods (``__iter__``, ``iter_from``, ``iter_field``, etc.)
+ now yield :class:`whoosh.reading.TermInfo` objects.
+
+* The arguments to :class:`whoosh.query.FuzzyTerm` changed.
+
+
+
diff --git a/docs/source/releases/index.rst b/docs/source/releases/index.rst
new file mode 100644
index 0000000..cf63ae8
--- /dev/null
+++ b/docs/source/releases/index.rst
@@ -0,0 +1,11 @@
+=============
+Release notes
+=============
+
+.. toctree::
+ :maxdepth: 2
+
+ 2_0
+ 1_0
+ 0_3
+
diff --git a/docs/source/schema.rst b/docs/source/schema.rst
new file mode 100644
index 0000000..a7d9fab
--- /dev/null
+++ b/docs/source/schema.rst
@@ -0,0 +1,377 @@
+==================
+Designing a schema
+==================
+
+About schemas and fields
+========================
+
+The schema specifies the fields of documents in an index.
+
+Each document can have multiple fields, such as title, content, url, date, etc.
+
+Some fields can be indexed, and some fields can be stored with the document so
+the field value is available in search results.
+Some fields will be both indexed and stored.
+
+The schema is the set of all possible fields in a document. Each individual
+document might only use a subset of the available fields in the schema.
+
+For example, a simple schema for indexing emails might have fields like
+``from_addr``, ``to_addr``, ``subject``, ``body``, and ``attachments``, where
+the ``attachments`` field lists the names of attachments to the email. For
+emails without attachments, you would omit the attachments field.
+
+
+Built-in field types
+====================
+
+Whoosh provides some useful predefined field types:
+
+:class:`whoosh.fields.TEXT`
+ This type is for body text. It indexes (and optionally stores) the text and
+ stores term positions to allow phrase searching.
+
+ ``TEXT`` fields use :class:`~whoosh.analysis.StandardAnalyzer` by default. To specify a different
+ analyzer, use the ``analyzer`` keyword argument to the constructor, e.g.
+ ``TEXT(analyzer=analysis.StemmingAnalyzer())``. See :doc:`analysis`.
+
+ By default, ``TEXT`` fields store position information for each indexed term, to
+ allow you to search for phrases. If you don't need to be able to search for
+ phrases in a text field, you can turn off storing term positions to save
+ space. Use ``TEXT(phrase=False)``.
+
+ By default, ``TEXT`` fields are not stored. Usually you will not want to store
+ the body text in the search index. Usually you have the indexed documents
+ themselves available to read or link to based on the search results, so you
+ don't need to store their text in the search index. However, in some
+ circumstances it can be useful (see :doc:`highlight`). Use
+ ``TEXT(stored=True)`` to specify that the text should be stored in the index.
+
+:class:`whoosh.fields.KEYWORD`
+ This field type is designed for space- or comma-separated keywords. This
+ type is indexed and searchable (and optionally stored). To save space, it
+ does not support phrase searching.
+
+ To store the value of the field in the index, use ``stored=True`` in the
+ constructor. To automatically lowercase the keywords before indexing them,
+ use ``lowercase=True``.
+
+ By default, the keywords are space separated. To separate the keywords by
+ commas instead (to allow keywords containing spaces), use ``commas=True``.
+
+ If your users will use the keyword field for searching, use ``scorable=True``.
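+
+    For example, a sketch of a stored, comma-separated, searchable tags
+    field::
+
+        tags = KEYWORD(stored=True, lowercase=True, commas=True,
+                       scorable=True)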
+
+:class:`whoosh.fields.ID`
+ The ``ID`` field type simply indexes (and optionally stores) the entire value of
+ the field as a single unit (that is, it doesn't break it up into individual
+ terms). This type of field does not store frequency information, so it's
+ quite compact, but not very useful for scoring.
+
+ Use ``ID`` for fields like url or path (the URL or file path of a document),
+ date, category -- fields where the value must be treated as a whole, and
+ each document only has one value for the field.
+
+ By default, ``ID`` fields are not stored. Use ``ID(stored=True)`` to specify that
+ the value of the field should be stored with the document for use in the
+ search results. For example, you would want to store the value of a url
+ field so you could provide links to the original in your search results.
+
+:class:`whoosh.fields.STORED`
+ This field is stored with the document, but not indexed and not searchable.
+ This is useful for document information you want to display to the user in
+ the search results, but don't need to be able to search for.
+
+:class:`whoosh.fields.NUMERIC`
+ This field stores int, long, or floating point numbers in a compact,
+ sortable format.
+
+:class:`whoosh.fields.DATETIME`
+ This field stores datetime objects in a compact, sortable format.
+
+:class:`whoosh.fields.BOOLEAN`
+    This simple field indexes boolean values and allows users to search for
+ ``yes``, ``no``, ``true``, ``false``, ``1``, ``0``, ``t`` or ``f``.
+
+:class:`whoosh.fields.NGRAM`
+    Indexes the field text as a sequence of N-grams, which allows general
+    substring matching. See :doc:`ngrams`.
+
+Expert users can create their own field types.
+
+
+Creating a Schema
+=================
+
+To create a schema::
+
+ from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
+ from whoosh.analysis import StemmingAnalyzer
+
+ schema = Schema(from_addr=ID(stored=True),
+ to_addr=ID(stored=True),
+ subject=TEXT(stored=True),
+ body=TEXT(analyzer=StemmingAnalyzer()),
+ tags=KEYWORD)
+
+If you aren't specifying any constructor keyword arguments to one of the
+predefined fields, you can leave off the brackets (e.g. ``fieldname=TEXT`` instead
+of ``fieldname=TEXT()``). Whoosh will instantiate the class for you.
+
+Alternatively you can create a schema declaratively using the ``SchemaClass``
+base class::
+
+ from whoosh.fields import SchemaClass, TEXT, KEYWORD, ID, STORED
+
+ class MySchema(SchemaClass):
+ path = ID(stored=True)
+ title = TEXT(stored=True)
+ content = TEXT
+ tags = KEYWORD
+
+You can pass a declarative class to :func:`~whoosh.index.create_in` or
+:meth:`~whoosh.store.Storage.create_index()` instead of a
+:class:`~whoosh.fields.Schema` instance.
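+
+For example (assuming an existing ``indexdir`` directory)::
+
+    from whoosh import index
+
+    ix = index.create_in("indexdir", MySchema)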
+
+
+Modifying the schema after indexing
+===================================
+
+After you have created an index, you can add or remove fields to the schema
+using the ``add_field()`` and ``remove_field()`` methods. These methods are
+on the ``Writer`` object::
+
+ writer = ix.writer()
+ writer.add_field("fieldname", fields.TEXT(stored=True))
+ writer.remove_field("content")
+ writer.commit()
+
+(If you're going to modify the schema *and* add documents using the same
+writer, you must call ``add_field()`` and/or ``remove_field()`` *before* you
+add any documents.)
+
+These methods are also on the ``Index`` object as a convenience, but when you
+call them on an ``Index``, the Index object simply creates the writer, calls
+the corresponding method on it, and commits, so if you want to add or remove
+more than one field, it's much more efficient to create the writer yourself::
+
+ ix.add_field("fieldname", fields.KEYWORD)
+
+In the ``filedb`` backend, removing a field simply removes that field from the
+*schema* -- the index will not get smaller, data about that field will remain
+in the index until you optimize. Optimizing will compact the index, removing
+references to the deleted field as it goes::
+
+ writer = ix.writer()
+ writer.add_field("uuid", fields.ID(stored=True))
+ writer.remove_field("path")
+ writer.commit(optimize=True)
+
+Because data is stored on disk with the field name, *do not* add a new field with
+the same name as a deleted field without optimizing the index in between::
+
+ writer = ix.writer()
+    writer.remove_field("path")
+ # Don't do this!!!
+ writer.add_field("path", fields.KEYWORD)
+
+(A future version of Whoosh may automatically prevent this error.)
+
+
+Dynamic fields
+==============
+
+Dynamic fields let you associate a field type with any field name that matches
+a given "glob" (a name pattern containing ``*``, ``?``, and/or ``[abc]``
+wildcards).
+
+You can add dynamic fields to a new schema using the ``add()`` method with the
+``glob`` keyword set to True::
+
+ schema = fields.Schema(...)
+ # Any name ending in "_d" will be treated as a stored
+ # DATETIME field
+ schema.add("*_d", fields.DATETIME(stored=True), glob=True)
+
+To set up a dynamic field on an existing index, use the same
+``IndexWriter.add_field`` method as if you were adding a regular field, but
+with the ``glob`` keyword argument set to ``True``::
+
+ writer = ix.writer()
+ writer.add_field("*_d", fields.DATETIME(stored=True), glob=True)
+ writer.commit()
+
+To remove a dynamic field, use the ``IndexWriter.remove_field()`` method with
+the glob as the name::
+
+ writer = ix.writer()
+ writer.remove_field("*_d")
+ writer.commit()
+
+For example, to allow documents to contain any field name that ends in ``_id``
+and associate it with the ``ID`` field type::
+
+ schema = fields.Schema(path=fields.ID)
+ schema.add("*_id", fields.ID, glob=True)
+
+ ix = index.create_in("myindex", schema)
+
+ w = ix.writer()
+ w.add_document(path=u"/a", test_id=u"alfa")
+ w.add_document(path=u"/b", class_id=u"MyClass")
+ # ...
+ w.commit()
+
+ qp = qparser.QueryParser("path", schema=schema)
+ q = qp.parse(u"test_id:alfa")
+ with ix.searcher() as s:
+ results = s.search(q)
+
+
+Advanced schema setup
+=====================
+
+Field boosts
+------------
+
+You can specify a field boost for a field. This is a multiplier applied to the
+score of any term found in the field. For example, to make terms found in the
+title field score twice as high as terms in the body field::
+
+ schema = Schema(title=TEXT(field_boost=2.0), body=TEXT)
+
+
+Field types
+-----------
+
+The predefined field types listed above are subclasses of ``fields.FieldType``.
+``FieldType`` is a pretty simple class. Its attributes contain information that
+define the behavior of a field.
+
+============ =============== ======================================================
+Attribute Type Description
+============ =============== ======================================================
+format fields.Format Defines what kind of information a field records
+ about each term, and how the information is stored
+ on disk.
+vector fields.Format Optional: if defined, the format in which to store
+ per-document forward-index information for this field.
+scorable bool If True, the length of (number of terms in) the field in
+ each document is stored in the index. Slightly misnamed,
+ since field lengths are not required for all scoring.
+ However, field lengths are required to get proper
+ results from BM25F.
+stored bool If True, the value of this field is stored
+ in the index.
+unique bool If True, the value of this field may be used to
+ replace documents with the same value when the user
+ calls
+                             :meth:`~whoosh.writing.IndexWriter.update_document`
+ on an ``IndexWriter``.
+============ =============== ======================================================
+
+The constructors for most of the predefined field types have parameters that let
+you customize these parts. For example:
+
+* Most of the predefined field types take a ``stored`` keyword argument that
+  sets ``FieldType.stored``.
+
+* The ``TEXT()`` constructor takes an ``analyzer`` keyword argument that is
+ passed on to the format object.
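+
+A short sketch of customizing these parts (the stemming analyzer here is just
+an example choice)::
+
+    from whoosh.analysis import StemmingAnalyzer
+    from whoosh.fields import ID, TEXT
+
+    # stored=True sets FieldType.stored; the analyzer is passed to the format
+    title = TEXT(stored=True, analyzer=StemmingAnalyzer())
+    path = ID(stored=True)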
+
+Formats
+-------
+
+A ``Format`` object defines what kind of information a field records about each
+term, and how the information is stored on disk.
+
+For example, the ``Existence`` format would store postings like this:
+
+====
+Doc
+====
+10
+20
+30
+====
+
+Whereas the ``Positions`` format would store postings like this:
+
+===== =============
+Doc Positions
+===== =============
+10 ``[1,5,23]``
+20 ``[45]``
+30 ``[7,12]``
+===== =============
+
+The indexing code passes the unicode string for a field to the field's ``Format``
+object. The ``Format`` object calls its analyzer (see text analysis) to break the
+string into tokens, then encodes information about each token.
+
+Whoosh ships with the following pre-defined formats.
+
+=============== ================================================================
+Class name Description
+=============== ================================================================
+Stored A "null" format for fields that are stored but not indexed.
+Existence Records only whether a term is in a document or not, i.e. it
+ does not store term frequency. Useful for identifier fields
+ (e.g. path or id) and "tag"-type fields, where the frequency
+ is expected to always be 0 or 1.
+Frequency Stores the number of times each term appears in each document.
+Positions Stores the number of times each term appears in each document,
+ and at what positions.
+=============== ================================================================
+
+The ``STORED`` field type uses the ``Stored`` format (which does nothing, so ``STORED``
+fields are not indexed). The ``ID`` type uses the ``Existence`` format. The ``KEYWORD`` type
+uses the ``Frequency`` format. The ``TEXT`` type uses the ``Positions`` format if it is
+instantiated with ``phrase=True`` (the default), or ``Frequency`` if ``phrase=False``.
+
+In addition, the following formats are implemented for the possible convenience
+of expert users, but are not currently used in Whoosh:
+
+================= ================================================================
+Class name Description
+================= ================================================================
+DocBoosts Like Existence, but also stores per-document boosts
+Characters Like Positions, but also stores the start and end character
+ indices of each term
+PositionBoosts Like Positions, but also stores per-position boosts
+CharacterBoosts Like Positions, but also stores the start and end character
+ indices of each term and per-position boosts
+================= ================================================================
+
+Vectors
+-------
+
+The main index is an inverted index. It maps terms to the documents they appear
+in. It is also sometimes useful to store a forward index, also known as a term
+vector, that maps documents to the terms that appear in them.
+
+For example, imagine an inverted index like this for a field:
+
+========== =========================================================
+Term Postings
+========== =========================================================
+apple ``[(doc=1, freq=2), (doc=2, freq=5), (doc=3, freq=1)]``
+bear ``[(doc=2, freq=7)]``
+========== =========================================================
+
+The corresponding forward index, or term vector, would be:
+
+========== ======================================================
+Doc Postings
+========== ======================================================
+1 ``[(text=apple, freq=2)]``
+2 ``[(text=apple, freq=5), (text='bear', freq=7)]``
+3 ``[(text=apple, freq=1)]``
+========== ======================================================
+
+If you set ``FieldType.vector`` to a ``Format`` object, the indexing code will use the
+``Format`` object to store information about the terms in each document. Currently
+by default Whoosh does not make use of term vectors at all, but they are
+available to expert users who want to implement their own field types.
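+
+A minimal sketch of enabling term vectors on a field by passing a ``Format``
+object (passing ``vector=True``, described in the release notes, also works in
+recent versions)::
+
+    from whoosh import fields, formats
+
+    schema = fields.Schema(content=fields.TEXT(vector=formats.Positions()))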
+
+
+
+
diff --git a/docs/source/searching.rst b/docs/source/searching.rst
new file mode 100644
index 0000000..ab4f2a9
--- /dev/null
+++ b/docs/source/searching.rst
@@ -0,0 +1,400 @@
+=============
+How to search
+=============
+
+Once you've created an index and added documents to it, you can search for those
+documents.
+
+The ``Searcher`` object
+=======================
+
+To get a :class:`whoosh.searching.Searcher` object, call ``searcher()`` on your
+``Index`` object::
+
+ searcher = myindex.searcher()
+
+You'll usually want to open the searcher using a ``with`` statement so the
+searcher is automatically closed when you're done with it (searcher objects
+represent a number of open files, so if you don't explicitly close them and the
+system is slow to collect them, you can run out of file handles)::
+
+ with ix.searcher() as searcher:
+ ...
+
+This is of course equivalent to::
+
+ try:
+ searcher = ix.searcher()
+ ...
+ finally:
+ searcher.close()
+
+The ``Searcher`` object is the main high-level interface for reading the index. It
+has lots of useful methods for getting information about the index, such as
+``lexicon(fieldname)``.
+
+::
+
+ >>> list(searcher.lexicon("content"))
+ [u"document", u"index", u"whoosh"]
+
+However, the most important method on the ``Searcher`` object is
+:meth:`~whoosh.searching.Searcher.search`, which takes a
+:class:`whoosh.query.Query` object and returns a
+:class:`~whoosh.searching.Results` object::
+
+ from whoosh.qparser import QueryParser
+
+ qp = QueryParser("content", schema=myindex.schema)
+ q = qp.parse(u"hello world")
+
+ with myindex.searcher() as s:
+ results = s.search(q)
+
+By default the ``Results`` object contains at most the first 10 matching documents. To get
+more results, use the ``limit`` keyword::
+
+ results = s.search(q, limit=20)
+
+If you want all results, use ``limit=None``. However, setting the limit whenever
+possible makes searches faster because Whoosh doesn't need to examine and score
+every document.
+
+Since displaying a page of results at a time is a common pattern, the
+``search_page`` method lets you conveniently retrieve only the results on a
+given page::
+
+ results = s.search_page(q, 1)
+
+The default page length is 10 hits. You can use the ``pagelen`` keyword argument
+to set a different page length::
+
+ results = s.search_page(q, 5, pagelen=20)
+
+
+Results object
+==============
+
+The :class:`~whoosh.searching.Results` object acts like a list of the matched
+documents. You can use it to access the stored fields of each hit document,
+for example to display them to the user.
+
+::
+
+ >>> # Show the best hit's stored fields
+ >>> results[0]
+ {"title": u"Hello World in Python", "path": u"/a/b/c"}
+ >>> results[0:2]
+ [{"title": u"Hello World in Python", "path": u"/a/b/c"},
+ {"title": u"Foo", "path": u"/bar"}]
+
+By default, ``Searcher.search(myquery)`` limits the number of hits to 10, so the
+number of scored hits in the ``Results`` object may be less than the number of
+matching documents in the index.
+
+::
+
+ >>> # How many documents in the entire index would have matched?
+ >>> len(results)
+ 27
+ >>> # How many scored and sorted documents in this Results object?
+ >>> # This will often be less than len() if the number of hits was limited
+ >>> # (the default).
+ >>> results.scored_length()
+ 10
+
+Calling ``len(Results)`` runs a fast (unscored) version of the query again to
+figure out the total number of matching documents. This is usually very fast
+but for large indexes it can cause a noticeable delay. If you want to avoid
+this delay on very large indexes, you can use the
+:meth:`~whoosh.searching.Results.has_exact_length`,
+:meth:`~whoosh.searching.Results.estimated_length`, and
+:meth:`~whoosh.searching.Results.estimated_min_length` methods to estimate the
+number of matching documents without calling ``len()``::
+
+ found = results.scored_length()
+ if results.has_exact_length():
+ print("Scored", found, "of exactly", len(results), "documents")
+ else:
+ low = results.estimated_min_length()
+ high = results.estimated_length()
+
+ print("Scored", found, "of between", low, "and", high, "documents")
+
+
+Scoring and sorting
+===================
+
+Scoring
+-------
+
+Normally the list of result documents is sorted by *score*. The
+:mod:`whoosh.scoring` module contains implementations of various scoring
+algorithms. The default is :class:`~whoosh.scoring.BM25F`.
+
+You can set the scoring object to use when you create the searcher using the
+``weighting`` keyword argument::
+
+ from whoosh import scoring
+
+ with myindex.searcher(weighting=scoring.TF_IDF()) as s:
+ ...
+
+A weighting model is a :class:`~whoosh.scoring.WeightingModel` subclass with a
+``scorer()`` method that produces a "scorer" instance. The scorer instance has
+a ``score()`` method that takes the current matcher and returns a floating
+point score.
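+
+As a rough illustration (a sketch, not an example from the library itself,
+reusing ``myindex`` and ``q`` from the examples above), a minimal weighting
+model that simply scores each document by its raw term weight might look like
+this; the exact ``scorer()`` signature can vary slightly between Whoosh
+versions::
+
+    from whoosh import scoring
+
+    class RawWeightScorer(scoring.BaseScorer):
+        def score(self, matcher):
+            # Score each document by the matcher's raw term weight
+            # (the term frequency, by default)
+            return matcher.weight()
+
+    class RawWeighting(scoring.WeightingModel):
+        def scorer(self, searcher, fieldname, text, qf=1):
+            return RawWeightScorer()
+
+    with myindex.searcher(weighting=RawWeighting()) as s:
+        results = s.search(q)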
+
+Sorting
+-------
+
+See :doc:`facets`.
+
+
+Highlighting snippets and More Like This
+========================================
+
+See :doc:`highlight` and :doc:`keywords` for information on these topics.
+
+
+Filtering results
+=================
+
+You can use the ``filter`` keyword argument to ``search()`` to specify a set of
+documents to permit in the results. The argument can be a
+:class:`whoosh.query.Query` object, a :class:`whoosh.searching.Results` object,
+or a set-like object containing document numbers. The searcher caches filters,
+so if, for example, you use the same query filter with a searcher multiple
+times, the subsequent searches will be faster because the searcher will cache
+the results of running the filter query.
+
+You can also specify a ``mask`` keyword argument to specify a set of documents
+that are not permitted in the results.
+
+::
+
+ with myindex.searcher() as s:
+ qp = qparser.QueryParser("content", myindex.schema)
+ user_q = qp.parse(query_string)
+
+ # Only show documents in the "rendering" chapter
+ allow_q = query.Term("chapter", "rendering")
+ # Don't show any documents where the "tag" field contains "todo"
+ restrict_q = query.Term("tag", "todo")
+
+ results = s.search(user_q, filter=allow_q, mask=restrict_q)
+
+(If you specify both a ``filter`` and a ``mask``, and a matching document
+appears in both, the ``mask`` "wins" and the document is not permitted.)
+
+To find out how many results were filtered out of the results, use
+``results.filtered_count`` (or ``resultspage.results.filtered_count``)::
+
+ with myindex.searcher() as s:
+ qp = qparser.QueryParser("content", myindex.schema)
+ user_q = qp.parse(query_string)
+
+ # Filter documents older than 7 days
+ old_q = query.DateRange("created", None, datetime.now() - timedelta(days=7))
+ results = s.search(user_q, mask=old_q)
+
+ print("Filtered out %d older documents" % results.filtered_count)
+
+
+Which terms from my query matched?
+==================================
+
+You can use the ``terms=True`` keyword argument to ``search()`` to have the
+search record which terms in the query matched which documents::
+
+ with myindex.searcher() as s:
+        results = s.search(myquery, terms=True)
+
+You can then get information about which terms matched from the
+:class:`whoosh.searching.Results` and :class:`whoosh.searching.Hit` objects::
+
+ # Was this results object created with terms=True?
+ if results.has_matched_terms():
+ # What terms matched in the results?
+ print(results.matched_terms())
+
+ # What terms matched in each hit?
+ for hit in results:
+ print(hit.matched_terms())
+
+
+.. _collapsing:
+
+Collapsing results
+==================
+
+Whoosh lets you eliminate all but the top N documents with the same facet key
+from the results. This can be useful in a few situations:
+
+* Eliminating duplicates at search time.
+
+* Restricting the number of matches per source. For example, in a web search
+ application, you might want to show at most three matches from any website.
+
+Whether a document should be collapsed is determined by the value of a "collapse
+facet". If a document has an empty collapse key, it will never be collapsed,
+but otherwise only the top N documents with the same collapse key will appear
+in the results.
+
+See :doc:`/facets` for information on facets.
+
+::
+
+ with myindex.searcher() as s:
+ # Set the facet to collapse on and the maximum number of documents per
+ # facet value (default is 1)
+        results = s.search(myquery, collapse="hostname", collapse_limit=3)
+
+ # Dictionary mapping collapse keys to the number of documents that
+ # were filtered out by collapsing on that key
+ print(results.collapsed_counts)
+
+Collapsing works with both scored and sorted results. You can use any of the
+facet types available in the :mod:`whoosh.sorting` module.
+
+By default, Whoosh uses the results order (score or sort key) to determine the
+documents to collapse. For example, in scored results, the best scoring
+documents would be kept. You can optionally specify a ``collapse_order`` facet
+to control which documents to keep when collapsing.
+
+For example, in a product search you could display results sorted by decreasing
+price, and eliminate all but the highest rated item of each product type::
+
+ from whoosh import sorting
+
+ with myindex.searcher() as s:
+ price_facet = sorting.FieldFacet("price", reverse=True)
+ type_facet = sorting.FieldFacet("type")
+ rating_facet = sorting.FieldFacet("rating", reverse=True)
+
+        results = s.search(myquery,
+                           sortedby=price_facet,        # Sort by reverse price
+                           collapse=type_facet,         # Collapse on product type
+                           collapse_order=rating_facet  # Collapse to highest rated
+                           )
+
+The collapsing happens during the search, so it is usually more efficient than
+finding everything and post-processing the results. However, if the collapsing
+eliminates a large number of documents, collapsed search can take longer
+because the search has to consider more documents and remove many
+already-collected documents.
+
+Since this collector must sometimes go back and remove already-collected
+documents, if you use it in combination with
+:class:`~whoosh.collectors.TermsCollector` and/or
+:class:`~whoosh.collectors.FacetCollector`, those collectors may contain
+information about documents that were filtered out of the final results by
+collapsing.
+
+
+Time limited searches
+=====================
+
+To limit the amount of time a search can take::
+
+ from whoosh.collectors import TimeLimitCollector, TimeLimit
+
+ with myindex.searcher() as s:
+ # Get a collector object
+ c = s.collector(limit=None, sortedby="title_exact")
+        # Wrap it in a TimeLimitCollector and set the time limit to 10 seconds
+        tlc = TimeLimitCollector(c, timelimit=10.0)
+
+ # Try searching
+ try:
+ s.search_with_collector(myquery, tlc)
+ except TimeLimit:
+ print("Search took too long, aborting!")
+
+ # You can still get partial results from the collector
+ results = tlc.results()
+
+
+Convenience methods
+===================
+
+The :meth:`~whoosh.searching.Searcher.document` and
+:meth:`~whoosh.searching.Searcher.documents` methods on the ``Searcher`` object let
+you retrieve the stored fields of documents matching terms you pass in keyword
+arguments.
+
+This is especially useful for fields such as dates/times, identifiers, paths,
+and so on.
+
+::
+
+ >>> list(searcher.documents(indexeddate=u"20051225"))
+ [{"title": u"Christmas presents"}, {"title": u"Turkey dinner report"}]
+    >>> print(searcher.document(path=u"/a/b/c"))
+ {"title": "Document C"}
+
+These methods have some limitations:
+
+* The results are not scored.
+* Multiple keywords are always AND-ed together.
+* The entire value of each keyword argument is considered a single term; you
+ can't search for multiple terms in the same field.
+
+
+Combining Results objects
+=========================
+
+It is sometimes useful to use the results of another query to influence the
+order of a :class:`whoosh.searching.Results` object.
+
+For example, you might have a "best bet" field. This field contains hand-picked
+keywords for documents. When the user searches for those keywords, you want
+those documents to be placed at the top of the results list. You could try to
+do this by boosting the "bestbet" field tremendously, but that can have
+unpredictable effects on scoring. It's much easier to simply run the query
+twice and combine the results::
+
+ # Parse the user query
+ userquery = queryparser.parse(querystring)
+
+ # Get the terms searched for
+ termset = set()
+ userquery.existing_terms(termset)
+
+ # Formulate a "best bet" query for the terms the user
+ # searched for in the "content" field
+ bbq = Or([Term("bestbet", text) for fieldname, text
+ in termset if fieldname == "content"])
+
+ # Find documents matching the searched for terms
+ results = s.search(bbq, limit=5)
+
+ # Find documents that match the original query
+ allresults = s.search(userquery, limit=10)
+
+ # Add the user query results on to the end of the "best bet"
+ # results. If documents appear in both result sets, push them
+ # to the top of the combined results.
+ results.upgrade_and_extend(allresults)
+
+The ``Results`` object supports the following methods:
+
+``Results.extend(results)``
+ Adds the documents in 'results' on to the end of the list of result
+ documents.
+
+``Results.filter(results)``
+ Removes the documents in 'results' from the list of result documents.
+
+``Results.upgrade(results)``
+ Any result documents that also appear in 'results' are moved to the top
+ of the list of result documents.
+
+``Results.upgrade_and_extend(results)``
+ Any result documents that also appear in 'results' are moved to the top
+ of the list of result documents. Then any other documents in 'results' are
+ added on to the list of result documents.
+
+
+
+
+
+
diff --git a/docs/source/spelling.rst b/docs/source/spelling.rst
new file mode 100644
index 0000000..cc1abc8
--- /dev/null
+++ b/docs/source/spelling.rst
@@ -0,0 +1,130 @@
+=====================================================
+"Did you mean... ?" Correcting errors in user queries
+=====================================================
+
+Overview
+========
+
+Whoosh can quickly suggest replacements for mis-typed words by returning
+a list of words from the index (or a dictionary) that are close to the
+mis-typed word::
+
+ with ix.searcher() as s:
+ corrector = s.corrector("text")
+ for mistyped_word in mistyped_words:
+            print(corrector.suggest(mistyped_word, limit=3))
+
+See the :meth:`whoosh.spelling.Corrector.suggest` method documentation
+for information on the arguments.
+
+Currently the suggestion engine is more like a "typo corrector" than a
+real "spell checker" since it doesn't do the kind of sophisticated
+phonetic matching or semantic/contextual analysis a good spell checker
+might. However, it is still very useful.
+
+There are two main strategies for correcting words:
+
+* Use the terms from an index field.
+
+* Use words from a word list.
+
+
+Pulling suggestions from an indexed field
+=========================================
+
+In Whoosh 2.7 and later, spelling suggestions are available on all fields.
+However, if you have an analyzer that modifies the indexed words (such as
+stemming), you can add ``spelling=True`` to a field to have it store separate
+unmodified versions of the terms for spelling suggestions::
+
+    from whoosh import analysis, fields
+
+    ana = analysis.StemmingAnalyzer()
+    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
+
+You can then use the :meth:`whoosh.searching.Searcher.corrector` method
+to get a corrector for a field::
+
+ corrector = searcher.corrector("content")
+
+The advantage of using the contents of an index field is that when you
+are spell checking queries on that index, the suggestions are tailored
+to the contents of the index. The disadvantage is that if the indexed
+documents contain spelling errors, then the spelling suggestions will
+also be erroneous.
+
+
+Pulling suggestions from a word list
+====================================
+
+There are plenty of word lists available on the internet you can use to
+populate the spelling dictionary.
+
+(In the following examples, ``word_list`` can be a list of unicode
+strings, or a file object with one word on each line.)
+
+To create a :class:`whoosh.spelling.Corrector` object from a sorted word list::
+
+ from whoosh.spelling import ListCorrector
+
+    # word_list must be a sorted list of unicode strings
+ corrector = ListCorrector(word_list)
+
+
+Merging two or more correctors
+==============================
+
+You can combine suggestions from two sources (for example, the contents
+of an index field and a word list) using a
+:class:`whoosh.spelling.MultiCorrector`::
+
+    from whoosh import spelling
+
+    c1 = searcher.corrector("content")
+    c2 = spelling.ListCorrector(word_list)
+    corrector = spelling.MultiCorrector([c1, c2])
+
+
+Correcting user queries
+=======================
+
+You can spell-check a user query using the
+:meth:`whoosh.searching.Searcher.correct_query` method::
+
+ from whoosh import qparser
+
+ # Parse the user query string
+ qp = qparser.QueryParser("content", myindex.schema)
+ q = qp.parse(qstring)
+
+ # Try correcting the query
+ with myindex.searcher() as s:
+ corrected = s.correct_query(q, qstring)
+ if corrected.query != q:
+ print("Did you mean:", corrected.string)
+
+The ``correct_query`` method returns an object with the following
+attributes:
+
+``query``
+ A corrected :class:`whoosh.query.Query` tree. You can test
+ whether this is equal (``==``) to the original parsed query to
+ check if the corrector actually changed anything.
+
+``string``
+ A corrected version of the user's query string.
+
+``tokens``
+ A list of corrected token objects representing the corrected
+ terms. You can use this to reformat the user query (see below).
+
+
+You can use a :class:`whoosh.highlight.Formatter` object to format the
+corrected query string. For example, use the
+:class:`~whoosh.highlight.HtmlFormatter` to format the corrected string
+as HTML::
+
+ from whoosh import highlight
+
+ hf = highlight.HtmlFormatter()
+ corrected = s.correct_query(q, qstring, formatter=hf)
+
+See the documentation for
+:meth:`whoosh.searching.Searcher.correct_query` for information on the
+defaults and arguments.
diff --git a/docs/source/stemming.rst b/docs/source/stemming.rst
new file mode 100644
index 0000000..9f1d738
--- /dev/null
+++ b/docs/source/stemming.rst
@@ -0,0 +1,217 @@
+========================================
+Stemming, variations, and accent folding
+========================================
+
+The problem
+===========
+
+The indexed text will often contain words in a different form from the one
+the user searches for. For example, if the user searches for ``render``, we
+would like the search to match not only documents that contain ``render``,
+but also ``renders``, ``rendering``, ``rendered``, etc.
+
+A related problem is one of accents. Names and loan words may contain accents in
+the original text but not in the user's query, or vice versa. For example, we
+want the user to be able to search for ``cafe`` and find documents containing
+``café``.
+
+The default analyzer for the :class:`whoosh.fields.TEXT` field does not do
+stemming or accent folding.
+
+
+Stemming
+========
+
+Stemming is a heuristic process of removing suffixes (and sometimes prefixes)
+from words to arrive (hopefully, most of the time) at the base word. Whoosh
+includes several stemming algorithms such as Porter and Porter2, Paice Husk,
+and Lovins.
+
+::
+
+ >>> from whoosh.lang.porter import stem
+ >>> stem("rendering")
+ 'render'
+
+The stemming filter applies the stemming function to the terms it indexes, and
+to words in user queries. So in theory all variations of a root word ("render",
+"rendered", "renders", "rendering", etc.) are reduced to a single term in the
+index, saving space. And all possible variations users might use in a query
+are reduced to the root, so stemming enhances "recall".
+
+The :class:`whoosh.analysis.StemFilter` lets you add a stemming filter to an
+analyzer chain.
+
+::
+
+    >>> from whoosh.analysis import RegexTokenizer, StemFilter
+    >>> rext = RegexTokenizer()
+ >>> stream = rext(u"fundamentally willows")
+ >>> stemmer = StemFilter()
+ >>> [token.text for token in stemmer(stream)]
+ [u"fundament", u"willow"]
+
+The :func:`whoosh.analysis.StemmingAnalyzer` is a pre-packaged analyzer that
+combines a tokenizer, lower-case filter, optional stop filter, and stem filter::
+
+ from whoosh import fields
+ from whoosh.analysis import StemmingAnalyzer
+
+ stem_ana = StemmingAnalyzer()
+    schema = fields.Schema(title=fields.TEXT(analyzer=stem_ana, stored=True),
+                           content=fields.TEXT(analyzer=stem_ana))
+
+Stemming has pros and cons.
+
+* It allows the user to find documents without worrying about word forms.
+
+* It reduces the size of the index, since it reduces the number of separate
+ terms indexed by "collapsing" multiple word forms into a single base word.
+
+* It's faster than using variations (see below).
+
+* The stemming algorithm can sometimes incorrectly conflate words or change
+ the meaning of a word by removing suffixes.
+
+* The stemmed forms are often not proper words, so the terms in the field
+ are not useful for things like creating a spelling dictionary.
+
+
+Variations
+==========
+
+Whereas stemming encodes the words in the index in a base form, when you use
+variations you instead index words "as is" and *at query time* expand words
+in the user query using a heuristic algorithm to generate morphological
+variations of the word.
+
+::
+
+ >>> from whoosh.lang.morph_en import variations
+ >>> variations("rendered")
+ set(['rendered', 'rendernesses', 'render', 'renderless', 'rendering',
+ 'renderness', 'renderes', 'renderer', 'renderements', 'rendereless',
+ 'renderenesses', 'rendere', 'renderment', 'renderest', 'renderement',
+ 'rendereful', 'renderers', 'renderful', 'renderings', 'renders', 'renderly',
+ 'renderely', 'rendereness', 'renderments'])
+
+Many of the generated variations for a given word will not be valid words, but
+it's fairly fast for Whoosh to check which variations are actually in the
+index and only search for those.
+
+The :class:`whoosh.query.Variations` query object lets you search for variations
+of a word. Whereas the normal :class:`whoosh.query.Term` object only searches
+for the given term, the ``Variations`` query acts like an ``Or`` query for the
+variations of the given word in the index. For example, the query::
+
+ query.Variations("content", "rendered")
+
+...might act like this (depending on what words are in the index)::
+
+ query.Or([query.Term("content", "render"), query.Term("content", "rendered"),
+ query.Term("content", "renders"), query.Term("content", "rendering")])
+
+To have the query parser use :class:`whoosh.query.Variations` instead of
+:class:`whoosh.query.Term` for individual terms, use the ``termclass``
+keyword argument to the parser initialization method::
+
+ from whoosh import qparser, query
+
+    qp = qparser.QueryParser("content", schema=myindex.schema,
+                             termclass=query.Variations)
+
+Using variations has pros and cons.
+
+* It allows the user to find documents without worrying about word forms.
+
+* The terms in the field are actual words, not stems, so you can use the
+ field's contents for other purposes such as spell checking queries.
+
+* It increases the size of the index relative to stemming, because different
+ word forms are indexed separately.
+
+* It acts like an ``Or`` search for all the variations, which is slower than
+ searching for a single term.
+
+
+Lemmatization
+=============
+
+Whereas stemming is a somewhat "brute force", mechanical attempt at reducing
+words to their base form using simple rules, lemmatization usually refers to
+more sophisticated methods of finding the base form ("lemma") of a word using
+language models, often involving analysis of the surrounding context and
+part-of-speech tagging.
+
+Whoosh does not include any lemmatization functions, but if you have separate
+lemmatizing code you could write a custom :class:`whoosh.analysis.Filter`
+to integrate it into a Whoosh analyzer.
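+
+For example, a minimal sketch of such a filter (``my_lemmatize`` below is a
+stand-in for your own lemmatizing function, not something Whoosh provides)::
+
+    from whoosh.analysis import Filter, LowercaseFilter, RegexTokenizer
+
+    def my_lemmatize(word):
+        # Placeholder: call your own lemmatizer here
+        return word
+
+    class LemmaFilter(Filter):
+        def __call__(self, tokens):
+            for token in tokens:
+                token.text = my_lemmatize(token.text)
+                yield token
+
+    lemma_ana = RegexTokenizer() | LowercaseFilter() | LemmaFilter()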
+
+
+Character folding
+=================
+
+You can set up an analyzer to treat, for example, ``á``, ``a``, ``å``, and ``â``
+as equivalent to improve recall. This is often very useful, allowing the user
+to, for example, type ``cafe`` or ``resume`` and find documents containing
+``café`` and ``resumé``.
+
+Character folding is especially useful for unicode characters that may appear
+in Asian language texts and should be treated as equivalent to their ASCII
+counterparts, such as "half-width" characters.
+
+Character folding is not always a panacea. See this article for caveats on where
+accent folding can break down.
+
+http://www.alistapart.com/articles/accent-folding-for-auto-complete/
+
+Whoosh includes several mechanisms for adding character folding to an analyzer.
+
+The :class:`whoosh.analysis.CharsetFilter` applies a character map to token
+text. For example, it will filter the tokens ``u'café', u'resumé', ...`` to
+``u'cafe', u'resume', ...``. This is usually the method you'll want to use
+unless you need to use a charset to tokenize terms::
+
+ from whoosh.analysis import CharsetFilter, StemmingAnalyzer
+ from whoosh import fields
+ from whoosh.support.charset import accent_map
+
+ # For example, to add an accent-folding filter to a stemming analyzer:
+ my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
+
+ # To use this analyzer in your schema:
+ my_schema = fields.Schema(content=fields.TEXT(analyzer=my_analyzer))
+
+The :class:`whoosh.analysis.CharsetTokenizer` uses a Sphinx charset table to
+both separate terms and perform character folding. This tokenizer is slower
+than the :class:`whoosh.analysis.RegexTokenizer` because it loops over each
+character in Python. If the language(s) you're indexing can be tokenized using
+regular expressions, it will be much faster to use ``RegexTokenizer`` and
+``CharsetFilter`` in combination instead of using ``CharsetTokenizer``.
+
+The :mod:`whoosh.support.charset` module contains an accent folding map useful
+for most Western languages, as well as a much more extensive Sphinx charset
+table and a function to convert Sphinx charset tables into the character maps
+required by ``CharsetTokenizer`` and ``CharsetFilter``::
+
+    # To create a filter using an enormous character map for most languages
+    # generated from a Sphinx charset table
+    from whoosh.analysis import CharsetFilter, StemmingAnalyzer
+    from whoosh.support.charset import default_charset, charset_table_to_dict
+    charmap = charset_table_to_dict(default_charset)
+    my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap)
+
+(The Sphinx charset table format is described at
+http://www.sphinxsearch.com/docs/current.html#conf-charset-table )
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/source/tech/backend.rst b/docs/source/tech/backend.rst
new file mode 100644
index 0000000..a68cbdd
--- /dev/null
+++ b/docs/source/tech/backend.rst
@@ -0,0 +1,175 @@
+==============================
+How to implement a new backend
+==============================
+
+Index
+=====
+
+* Subclass :class:`whoosh.index.Index`.
+
+* Indexes must implement the following methods.
+
+ * :meth:`whoosh.index.Index.is_empty`
+
+ * :meth:`whoosh.index.Index.doc_count`
+
+ * :meth:`whoosh.index.Index.reader`
+
+ * :meth:`whoosh.index.Index.writer`
+
+* Indexes that require/support locking must implement the following methods.
+
+ * :meth:`whoosh.index.Index.lock`
+
+ * :meth:`whoosh.index.Index.unlock`
+
+* Indexes that support deletion must implement the following methods.
+
+ * :meth:`whoosh.index.Index.delete_document`
+
+ * :meth:`whoosh.index.Index.doc_count_all` -- if the backend has delayed
+ deletion.
+
+* Indexes that require/support versioning/transactions *may* implement the following methods.
+
+ * :meth:`whoosh.index.Index.latest_generation`
+
+ * :meth:`whoosh.index.Index.up_to_date`
+
+ * :meth:`whoosh.index.Index.last_modified`
+
+* Indexes *may* implement the following methods (the base class's versions are no-ops).
+
+ * :meth:`whoosh.index.Index.optimize`
+
+ * :meth:`whoosh.index.Index.close`
+
+
+IndexWriter
+===========
+
+* Subclass :class:`whoosh.writing.IndexWriter`.
+
+* IndexWriters must implement the following methods.
+
+ * :meth:`whoosh.writing.IndexWriter.add_document`
+
+ * :meth:`whoosh.writing.IndexWriter.add_reader`
+
+* Backends that support deletion must implement the following methods.
+
+ * :meth:`whoosh.writing.IndexWriter.delete_document`
+
+* IndexWriters that work as transactions must implement the following methods.
+
+  * :meth:`whoosh.writing.IndexWriter.commit` -- Save the additions/deletions done with
+    this IndexWriter to the main index, and release any resources used by the IndexWriter.
+
+  * :meth:`whoosh.writing.IndexWriter.cancel` -- Throw away any additions/deletions done
+    with this IndexWriter, and release any resources used by the IndexWriter.
+
+
+IndexReader
+===========
+
+* Subclass :class:`whoosh.reading.IndexReader`.
+
+* IndexReaders must implement the following methods.
+
+ * :meth:`whoosh.reading.IndexReader.__contains__`
+
+ * :meth:`whoosh.reading.IndexReader.__iter__`
+
+ * :meth:`whoosh.reading.IndexReader.iter_from`
+
+ * :meth:`whoosh.reading.IndexReader.stored_fields`
+
+ * :meth:`whoosh.reading.IndexReader.doc_count_all`
+
+ * :meth:`whoosh.reading.IndexReader.doc_count`
+
+ * :meth:`whoosh.reading.IndexReader.doc_field_length`
+
+ * :meth:`whoosh.reading.IndexReader.field_length`
+
+ * :meth:`whoosh.reading.IndexReader.max_field_length`
+
+ * :meth:`whoosh.reading.IndexReader.postings`
+
+ * :meth:`whoosh.reading.IndexReader.has_vector`
+
+ * :meth:`whoosh.reading.IndexReader.vector`
+
+ * :meth:`whoosh.reading.IndexReader.doc_frequency`
+
+ * :meth:`whoosh.reading.IndexReader.frequency`
+
+* Backends that support deleting documents should implement the following
+ methods.
+
+ * :meth:`whoosh.reading.IndexReader.has_deletions`
+ * :meth:`whoosh.reading.IndexReader.is_deleted`
+
+* Backends that support versioning should implement the following methods.
+
+ * :meth:`whoosh.reading.IndexReader.generation`
+
+* If the IndexReader object does not keep the schema in the ``self.schema``
+ attribute, it needs to override the following methods.
+
+ * :meth:`whoosh.reading.IndexReader.field`
+
+ * :meth:`whoosh.reading.IndexReader.field_names`
+
+ * :meth:`whoosh.reading.IndexReader.scorable_names`
+
+ * :meth:`whoosh.reading.IndexReader.vector_names`
+
+* IndexReaders *may* implement the following methods.
+
+  * :meth:`whoosh.reading.IndexReader.close` -- closes any open resources associated with the
+ reader.
+
+
+Matcher
+=======
+
+The :meth:`whoosh.reading.IndexReader.postings` method returns a
+:class:`whoosh.matching.Matcher` object. You will probably need to implement
+a custom Matcher class for reading from your posting lists.
+
+* Subclass :class:`whoosh.matching.Matcher`.
+
+* Implement the following methods at minimum.
+
+ * :meth:`whoosh.matching.Matcher.is_active`
+
+ * :meth:`whoosh.matching.Matcher.copy`
+
+ * :meth:`whoosh.matching.Matcher.id`
+
+ * :meth:`whoosh.matching.Matcher.next`
+
+ * :meth:`whoosh.matching.Matcher.value`
+
+ * :meth:`whoosh.matching.Matcher.value_as`
+
+ * :meth:`whoosh.matching.Matcher.score`
+
+* Depending on the implementation, you *may* implement the following methods
+ more efficiently.
+
+ * :meth:`whoosh.matching.Matcher.skip_to`
+
+ * :meth:`whoosh.matching.Matcher.weight`
+
+* If the implementation supports quality, you should implement the following
+ methods.
+
+ * :meth:`whoosh.matching.Matcher.supports_quality`
+
+ * :meth:`whoosh.matching.Matcher.quality`
+
+ * :meth:`whoosh.matching.Matcher.block_quality`
+
+ * :meth:`whoosh.matching.Matcher.skip_to_quality`
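+
+As a very rough outline of the shape of such a class (Whoosh itself ships an
+in-memory ``whoosh.matching.ListMatcher`` you can study; the sketch below
+reads from a plain Python list and is not a complete implementation)::
+
+    from whoosh.matching import Matcher, ReadTooFar
+
+    class TinyListMatcher(Matcher):
+        """Reads postings from an in-memory list of (docnum, weight) pairs."""
+
+        def __init__(self, postings, i=0):
+            self._postings = postings
+            self._i = i
+
+        def is_active(self):
+            return self._i < len(self._postings)
+
+        def copy(self):
+            return TinyListMatcher(self._postings, self._i)
+
+        def id(self):
+            return self._postings[self._i][0]
+
+        def next(self):
+            if not self.is_active():
+                raise ReadTooFar
+            self._i += 1
+
+        def weight(self):
+            return self._postings[self._i][1]
+
+        def value(self):
+            return b""  # this sketch stores no per-posting value
+
+        def value_as(self, astype):
+            return None
+
+        def score(self):
+            return self.weight()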
diff --git a/docs/source/tech/filedb.rst b/docs/source/tech/filedb.rst
new file mode 100644
index 0000000..439c30f
--- /dev/null
+++ b/docs/source/tech/filedb.rst
@@ -0,0 +1,29 @@
+============
+filedb notes
+============
+
+TBD.
+
+Files created
+=============
+
+<revision_number>.toc
+ The "master" file containing information about the index and its segments.
+
+The index directory will contain a set of files for each segment. A segment is
+like a mini-index: when you add documents to the index, Whoosh creates a new
+segment and then searches the old segment(s) and the new segment to avoid
+having to do a big merge every time you add a document. When enough small
+segments accumulate, Whoosh merges them into larger segments or a single
+segment.
+
+<segment_number>.dci
+ Contains per-document information (e.g. field lengths). This will grow linearly with the number of documents.
+
+<segment_number>.dcz
+ Contains the stored fields for each document.
+
+<segment_number>.tiz
+    Contains per-term information. The size of this file will vary based on the number of unique terms.
+
+<segment_number>.pst
+ Contains per-term postings. The size of this file depends on the size of the collection and the formats used for each field (e.g. storing term positions takes more space than storing frequency only).
+
+<segment_number>.fvz
+    Contains term vectors (forward indexes) for each document. This file is only created if at least one field in the schema stores term vectors. The size will vary based on the number of documents, field length, the formats used for each vector (e.g. storing term positions takes more space than storing frequency only), etc.
+
diff --git a/docs/source/tech/index.rst b/docs/source/tech/index.rst
new file mode 100644
index 0000000..196d18f
--- /dev/null
+++ b/docs/source/tech/index.rst
@@ -0,0 +1,9 @@
+===============
+Technical notes
+===============
+
+.. toctree::
+ :glob:
+ :maxdepth: 2
+
+ *
diff --git a/docs/source/threads.rst b/docs/source/threads.rst
new file mode 100644
index 0000000..981a967
--- /dev/null
+++ b/docs/source/threads.rst
@@ -0,0 +1,74 @@
+====================================
+Concurrency, locking, and versioning
+====================================
+
+Concurrency
+===========
+
+The ``FileIndex`` object is "stateless" and should be shareable between
+threads.
+
+A ``Reader`` object (which underlies the ``Searcher`` object) wraps open files, and
+its individual methods often rely on consistent file cursor positions (e.g. they do
+two ``file.read()``\ s in a row, so if another thread moved the cursor between the
+two read calls, Bad Things would happen). You should use one Reader/Searcher per
+thread in your code.
+
+Readers/Searchers tend to cache information (such as field caches for sorting),
+so if you can share one across multiple search requests, it's a big performance
+win.
+
+
+Locking
+=======
+
+Only one thread/process can write to an index at a time. When you open a writer,
+it locks the index. If you try to open a writer on the same index in another
+thread/process, it will raise ``whoosh.store.LockError``.
+
+In a multi-threaded or multi-process environment your code needs to be aware
+that opening a writer may raise this exception if a writer is already open.
+Whoosh includes a couple of example implementations
+(:class:`whoosh.writing.AsyncWriter` and :class:`whoosh.writing.BufferedWriter`)
+of ways to work around the write lock.
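+
+For example, a short sketch using ``AsyncWriter``, which tries to acquire the
+write lock and, if the index is locked, buffers the changes and commits them
+from a background thread once the lock becomes available (the field names here
+are only for illustration)::
+
+    from whoosh.writing import AsyncWriter
+
+    writer = AsyncWriter(myindex)
+    writer.add_document(title=u"Hello", content=u"This write will not block")
+    writer.commit()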
+
+While the writer is open and during the commit, **the index is still available
+for reading**. Existing readers are unaffected and new readers can open the
+current index normally.
+
+
+Lock files
+----------
+
+Locking the index is accomplished by acquiring an exclusive file lock on the
+``<indexname>_WRITELOCK`` file in the index directory. The file is not deleted
+after the file lock is released, so the fact that the file exists **does not**
+mean the index is locked.
+
+
+Versioning
+==========
+
+When you open a reader/searcher, the reader represents a view of the **current
+version** of the index. If someone writes changes to the index, any readers
+that are already open **will not** pick up the changes automatically. A reader
+always sees the index as it existed when the reader was opened.
+
+If you are re-using a Searcher across multiple search requests, you can check
+whether the Searcher is a view of the latest version of the index using
+:meth:`whoosh.searching.Searcher.up_to_date`. If the searcher is not up to date,
+you can get an up-to-date copy of the searcher using
+:meth:`whoosh.searching.Searcher.refresh`::
+
+ # If 'searcher' is not up-to-date, replace it
+ searcher = searcher.refresh()
+
+(If the searcher has the latest version of the index, ``refresh()`` simply
+returns it.)
+
+Calling ``Searcher.refresh()`` is more efficient than closing the searcher and
+opening a new one, since it will re-use any underlying readers and caches that
+haven't changed.
+
+
+
diff --git a/files/whoosh.svg b/files/whoosh.svg
new file mode 100644
index 0000000..45b3db9
--- /dev/null
+++ b/files/whoosh.svg
@@ -0,0 +1,434 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://web.resource.org/cc/"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="64"
+ height="64"
+ id="svg1872"
+ sodipodi:version="0.32"
+ inkscape:version="0.44.1"
+ sodipodi:docbase="e:\dev_clean\src\houdini\support\icons\misc"
+ sodipodi:docname="rocketsearch3.svg"
+ version="1.0">
+ <defs
+ id="defs1874">
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient3372">
+ <stop
+ style="stop-color:white;stop-opacity:0.50196081"
+ offset="0"
+ id="stop3374" />
+ <stop
+ style="stop-color:white;stop-opacity:0;"
+ offset="1"
+ id="stop3376" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient3264">
+ <stop
+ style="stop-color:#f4bf00;stop-opacity:1"
+ offset="0"
+ id="stop3266" />
+ <stop
+ style="stop-color:#ae0000;stop-opacity:1"
+ offset="1"
+ id="stop3268" />
+ </linearGradient>
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient3254">
+ <stop
+ style="stop-color:#888a85;stop-opacity:1"
+ offset="0"
+ id="stop3256" />
+ <stop
+ style="stop-color:black;stop-opacity:1"
+ offset="1"
+ id="stop3258" />
+ </linearGradient>
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient3120">
+ <stop
+ style="stop-color:black;stop-opacity:1"
+ offset="0"
+ id="stop3122" />
+ <stop
+ style="stop-color:#2e3436;stop-opacity:1"
+ offset="1"
+ id="stop3124" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient3102">
+ <stop
+ id="stop3104"
+ offset="0"
+ style="stop-color:#f30;stop-opacity:1;" />
+ <stop
+ style="stop-color:#ff967c;stop-opacity:1;"
+ offset="0.5"
+ id="stop3106" />
+ <stop
+ id="stop3108"
+ offset="1"
+ style="stop-color:#f30;stop-opacity:1" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient3092">
+ <stop
+ style="stop-color:#f30;stop-opacity:1;"
+ offset="0"
+ id="stop3094" />
+ <stop
+ id="stop3100"
+ offset="0.5"
+ style="stop-color:#831a00;stop-opacity:1;" />
+ <stop
+ style="stop-color:#f30;stop-opacity:1"
+ offset="1"
+ id="stop3096" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2904">
+ <stop
+ style="stop-color:white;stop-opacity:0.50196081;"
+ offset="0"
+ id="stop2906" />
+ <stop
+ style="stop-color:white;stop-opacity:0;"
+ offset="1"
+ id="stop2908" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2882">
+ <stop
+ style="stop-color:#5e5e5e;stop-opacity:1;"
+ offset="0"
+ id="stop2884" />
+ <stop
+ id="stop2890"
+ offset="0.3392857"
+ style="stop-color:#bbb;stop-opacity:1;" />
+ <stop
+ style="stop-color:white;stop-opacity:1;"
+ offset="1"
+ id="stop2886" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2870">
+ <stop
+ style="stop-color:#c8c8c8;stop-opacity:1;"
+ offset="0"
+ id="stop2872" />
+ <stop
+ id="stop2878"
+ offset="0.5"
+ style="stop-color:#f5f5f5;stop-opacity:1;" />
+ <stop
+ style="stop-color:#a6a6a6;stop-opacity:1;"
+ offset="1"
+ id="stop2874" />
+ </linearGradient>
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient2830">
+ <stop
+ style="stop-color:white;stop-opacity:1;"
+ offset="0"
+ id="stop2832" />
+ <stop
+ style="stop-color:white;stop-opacity:0.1254902"
+ offset="1"
+ id="stop2834" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2804">
+ <stop
+ id="stop2806"
+ offset="0"
+ style="stop-color:#9b9b9b;stop-opacity:1;" />
+ <stop
+ id="stop2808"
+ offset="1"
+ style="stop-color:#444;stop-opacity:1;" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2789">
+ <stop
+ id="stop2791"
+ offset="0"
+ style="stop-color:#4793ff;stop-opacity:1;" />
+ <stop
+ id="stop2793"
+ offset="1"
+ style="stop-color:#002cc3;stop-opacity:1;" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2768">
+ <stop
+ style="stop-color:#5f5f5f;stop-opacity:1;"
+ offset="0"
+ id="stop2770" />
+ <stop
+ style="stop-color:black;stop-opacity:1;"
+ offset="1"
+ id="stop2772" />
+ </linearGradient>
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2768"
+ id="radialGradient2780"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.819915,0.825713,-4.242328,4.218819,108.7504,-85.37568)"
+ cx="24.99999"
+ cy="21.500006"
+ fx="24.988815"
+ fy="20.717813"
+ r="9.9999924" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2830"
+ id="linearGradient2836"
+ x1="21.333336"
+ y1="3.9222705"
+ x2="27.189482"
+ y2="39.764923"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.845056,0.845056,-0.937726,0.937726,37.14432,-15.46766)" />
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2904"
+ id="radialGradient2910"
+ cx="20.615717"
+ cy="19.266575"
+ fx="20.615717"
+ fy="19.266575"
+ r="2.5271387"
+ gradientTransform="matrix(0.887673,0.939084,-7.191088,6.869715,157.5193,-131.9547)"
+ gradientUnits="userSpaceOnUse" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient2943"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(-0.300031,-0.300031,-0.937726,0.937726,65.90146,13.4681)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient2985"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.300031,0.300031,-0.937726,0.937726,50.375,-1.939296)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient2989"
+ gradientUnits="userSpaceOnUse"
+ x1="17.006248"
+ y1="34.714287"
+ x2="6.4563808"
+ y2="38.523811"
+ gradientTransform="matrix(-0.801316,-0.801316,-0.937726,0.937726,78.61505,25.91377)" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2870"
+ id="linearGradient2993"
+ gradientUnits="userSpaceOnUse"
+ x1="6.4938741"
+ y1="37.253971"
+ x2="17.035713"
+ y2="31.984127"
+ gradientTransform="matrix(0.801316,0.801316,-0.937726,0.937726,38.23962,-14.37236)" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3120"
+ id="linearGradient3126"
+ x1="41.179733"
+ y1="42.642097"
+ x2="46.466469"
+ y2="46.35638"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.707107,0.707107,-0.707107,0.707107,36.39568,-16.50071)" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3254"
+ id="linearGradient3260"
+ x1="58.910931"
+ y1="2.7760141"
+ x2="61.007938"
+ y2="5.1569667"
+ gradientUnits="userSpaceOnUse" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3372"
+ id="linearGradient3378"
+ x1="19.142857"
+ y1="45.142857"
+ x2="27"
+ y2="36.285713"
+ gradientUnits="userSpaceOnUse" />
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="7"
+ inkscape:cx="33.904762"
+ inkscape:cy="28.845689"
+ inkscape:current-layer="layer1"
+ showgrid="false"
+ inkscape:grid-bbox="false"
+ inkscape:document-units="px"
+ inkscape:window-width="1323"
+ inkscape:window-height="1097"
+ inkscape:window-x="286"
+ inkscape:window-y="-2"
+ width="64px"
+ height="-64px"
+ showborder="true" />
+ <metadata
+ id="metadata1877">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ id="layer1"
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer">
+ <path
+ id="path3158"
+ style="color:black;fill:#f30;fill-opacity:1;fill-rule:nonzero;stroke:white;stroke-width:1.49999976;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible"
+ d="M 60.76189,30.857133 C 60.76189,46.207987 48.303227,58.666651 32.952373,58.666651 C 17.60152,58.666651 5.1428554,46.207987 5.1428554,30.857133 C 5.1428554,15.506279 17.60152,3.0476169 32.952373,3.0476169 C 48.303227,3.0476169 60.76189,15.506279 60.76189,30.857133 z M 53.835608,43.359654 C 59.467447,33.342069 54.592569,19.745662 42.954184,13.010532 C 31.315799,6.2754018 17.299384,8.9394241 11.667545,18.957011 C 6.0357065,28.974596 10.910585,42.571002 22.54897,49.306134 C 34.187355,56.041264 48.203769,53.377239 53.835608,43.359654 z " />
+ <path
+ style="fill:url(#linearGradient2989);fill-opacity:1;fill-rule:evenodd;stroke:url(#linearGradient3126);stroke-width:1.50000036;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 42.626074,35.650097 C 42.626074,35.650097 43.66315,46.823546 40.638452,52.595613 C 37.613753,58.367679 31.65107,62.898598 27.208288,63.278408 L 23.087236,59.157356 C 23.087236,59.157356 28.612867,55.246187 29.292841,52.08595 C 29.972816,48.925714 25.807135,46.516407 25.807135,46.516407 L 42.626074,35.650097 z "
+ id="path2959"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:none;fill-opacity:1;fill-rule:evenodd;stroke:white;stroke-width:1.50000024;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.75294118"
+ d="M 41.415401,37.015574 C 41.415401,37.015574 41.519251,47.582288 39.169121,52.032767 C 36.818989,56.483244 30.612907,61.588367 27.626439,61.433754 L 25.462896,59.270211 C 25.462896,59.270211 29.750463,55.79718 30.63454,52.380098 C 31.553191,48.829377 27.913031,46.086607 27.913031,46.086607 L 41.415401,37.015574 z "
+ id="path2961"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:url(#linearGradient2943);fill-opacity:1;fill-rule:evenodd;stroke:#2e3436;stroke-width:1.49999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 32.882714,26.263966 C 32.882714,26.263966 25.528575,29.046204 20.615854,32.930247 C 15.703135,36.81429 10.724376,42.329132 9.0737156,45.501064 L 10.616731,47.04408 C 10.616731,47.04408 14.558768,42.859158 16.943208,41.403387 C 19.327646,39.947615 20.154488,41.22099 20.154488,41.22099 L 32.882714,26.263966 z "
+ id="path2935"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:#f30;fill-opacity:1;fill-rule:evenodd;stroke:white;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;stroke-miterlimit:4;stroke-dasharray:none"
+ d="M 17.428571,45.809524 C 17.428571,45.809524 9.4015395,44.790567 6.1904762,50.190476 C 3.1394595,55.321241 5.3077553,56.549478 0.19047619,64.095238 C 8.2367444,59.358933 9.6375886,61.426505 14.380952,57.809524 C 19.376743,54.000058 17.428571,45.809524 17.428571,45.809524 z "
+ id="path3262"
+ sodipodi:nodetypes="cscsc" />
+ <path
+ style="fill:url(#linearGradient2993);fill-opacity:1;fill-rule:evenodd;stroke:#2e3436;stroke-width:1.50000036;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 28.50328,21.616611 C 28.50328,21.616611 17.32983,20.579534 11.557765,23.604232 C 5.7856986,26.62893 1.2547794,32.591615 0.87496908,37.034396 L 4.9960219,41.155449 C 4.9960219,41.155449 8.9071898,35.629817 12.067427,34.949842 C 15.227665,34.269868 17.636969,38.435549 17.636969,38.435549 L 28.50328,21.616611 z "
+ id="path2838"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:none;fill-opacity:1;fill-rule:evenodd;stroke:white;stroke-width:1.50000024;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.75294118"
+ d="M 27.07046,23.029314 C 27.07046,23.029314 16.571089,22.723434 12.120612,25.073565 C 7.6701337,27.423697 2.9690707,33.09103 2.5849362,36.481558 L 4.8158227,38.847133 C 4.8158227,38.847133 8.1541689,34.290191 11.571249,33.406115 C 15.12197,32.487462 18.06677,36.329654 18.06677,36.329654 L 27.07046,23.029314 z "
+ id="path2852"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:url(#radialGradient2780);fill-opacity:1;fill-rule:evenodd;stroke:#2e3436;stroke-width:1.50000024;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 15.362051,40.711571 C 15.362051,40.711571 24.592712,23.530522 34.263857,14.118755 C 43.936021,4.7059904 57.497432,6.5153052 57.497432,6.5153052 C 57.497432,6.5153052 59.409726,20.154194 49.893983,29.748882 C 40.378649,39.343156 23.247767,48.704082 23.247767,48.704082 C 19.580003,46.984191 16.984432,44.23841 15.362051,40.711571 z "
+ id="path2778"
+ sodipodi:nodetypes="czczcc" />
+ <path
+ sodipodi:type="arc"
+ style="opacity:1;color:black;fill:black;fill-opacity:0.75294118;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible"
+ id="path2995"
+ sodipodi:cx="27.047619"
+ sodipodi:cy="16.063492"
+ sodipodi:rx="6.2857141"
+ sodipodi:ry="6.2857141"
+ d="M 33.333333 16.063492 A 6.2857141 6.2857141 0 1 1 20.761905,16.063492 A 6.2857141 6.2857141 0 1 1 33.333333 16.063492 z"
+ transform="matrix(0.971755,0.971755,-1.078319,1.078319,34.30673,-22.81046)" />
+ <path
+ style="fill:none;fill-opacity:1;fill-rule:evenodd;stroke:url(#linearGradient2836);stroke-width:1.50000048;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.50196078"
+ d="M 17.248983,40.52583 C 17.248983,40.52583 26.172977,23.984103 34.984707,15.407716 C 43.797368,6.8304182 55.706728,8.3594072 55.706728,8.3594072 C 55.706728,8.3594072 57.335627,20.225286 48.664843,28.967697 C 39.994432,37.709734 23.644535,46.661465 23.644535,46.661465 C 21.457284,45.891861 18.359082,42.478174 17.248983,40.52583 z "
+ id="path2812"
+ sodipodi:nodetypes="czczcc" />
+ <path
+ style="fill:url(#radialGradient2910);fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="M 54.548251,8.8426932 C 54.548251,8.8426932 46.888197,8.2797042 39.841232,14.412999 C 32.794267,20.546293 22.246535,36.793595 22.246535,36.793595 C 22.246535,36.793595 35.535282,23.287308 42.582246,17.154013 C 49.629212,11.020718 54.548251,8.8426932 54.548251,8.8426932 z "
+ id="path2776"
+ sodipodi:nodetypes="czczc" />
+ <g
+ id="g3116"
+ transform="matrix(1.14993,0,0,1.14993,6.800306,-2.240887)">
+ <path
+ d="M 34.34296,19.103943 C 34.34296,21.473198 32.456312,23.396072 30.131691,23.396072 C 27.807073,23.396072 25.920426,21.473198 25.920426,19.103943 C 25.920426,16.734686 27.807073,14.811811 30.131691,14.811811 C 32.456312,14.811811 34.34296,16.734686 34.34296,19.103943 z M 32.559792,19.065261 C 32.559792,17.677859 31.454998,16.551853 30.093735,16.551853 C 28.73247,16.551853 27.627678,17.677859 27.627678,19.065261 C 27.627678,20.452665 28.73247,21.578673 30.093735,21.578673 C 31.454998,21.578673 32.559792,20.452665 32.559792,19.065261 z "
+ style="color:black;fill:#eeeeec;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible"
+ id="path2997" />
+ <rect
+ ry="1.0419143"
+ transform="matrix(0.669447,0.74286,-0.669447,0.74286,0,0)"
+ y="-10.85847"
+ x="38.385689"
+ height="2.0838287"
+ width="5.2897201"
+ id="rect3021"
+ style="opacity:1;color:black;fill:#eeeeec;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible" />
+ </g>
+ <path
+ sodipodi:type="arc"
+ style="opacity:1;color:black;fill:white;fill-opacity:0.21960784;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible"
+ id="path3130"
+ sodipodi:cx="29.666666"
+ sodipodi:cy="20.238094"
+ sodipodi:rx="3.1904762"
+ sodipodi:ry="3.1904762"
+ d="M 32.857142 20.238094 A 3.1904762 3.1904762 0 1 1 26.47619,20.238094 A 3.1904762 3.1904762 0 1 1 32.857142 20.238094 z"
+ transform="translate(11.67337,-0.482171)" />
+ <path
+ style="fill:white;fill-opacity:1;fill-rule:evenodd;stroke:url(#linearGradient3260);stroke-width:1px;stroke-linecap:butt;stroke-linejoin:round;stroke-opacity:1"
+ d="M 55.887125,5.978836 C 56.414317,6.800748 56.847426,7.6885184 58.098765,8.0035273 L 63.650794,0.40564374 L 55.887125,5.978836 z "
+ id="path3252"
+ sodipodi:nodetypes="cccc" />
+ <path
+ style="fill:url(#linearGradient3378);fill-opacity:1.0;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="M 16.285714,40.714286 C 16.285714,40.714286 17.285714,43.392857 19.285714,45.142857 C 21.285714,46.892857 24.285714,47.714286 24.285714,47.714286 L 35.857143,40.285714 L 23.285714,28.571429 L 16.285714,40.714286 z "
+ id="path3362"
+ sodipodi:nodetypes="czcccc" />
+ <path
+ style="fill:url(#linearGradient2985);fill-opacity:1;fill-rule:evenodd;stroke:#181c1d;stroke-width:1.49999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 37.579113,31.079442 C 37.579113,31.079442 34.796876,38.43358 30.912832,43.3463 C 27.02879,48.259021 21.513946,53.23778 18.342014,54.888441 L 16.798998,53.345425 C 16.798998,53.345425 20.98392,49.403388 22.439691,47.018947 C 23.895464,44.634508 22.622088,43.807668 22.622088,43.807668 L 37.579113,31.079442 z "
+ id="path2913"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:none;fill-opacity:1;fill-rule:evenodd;stroke:white;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:5.2;stroke-dasharray:none;stroke-opacity:0.75294118"
+ d="M 34.306022,35.463077 C 34.306022,35.463077 32.674913,39.054685 30.182556,42.503764 C 29.189697,43.877743 25.069033,48.167451 18.268552,53.588983 C 22.922707,48.874967 23.670033,47.46904 23.987344,46.934259 C 24.304656,46.399476 24.555523,44.425075 24.261744,44.01222 L 34.306022,35.463077 z "
+ id="path2917"
+ sodipodi:nodetypes="csczcc" />
+ <path
+ style="fill:white;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 16.849661,46.487458 C 16.849661,46.487458 11.421623,47.862954 9.670382,50.370906 C 8.5940231,51.91236 9.0037508,53.439525 5.6912911,58.323961 C 10.899711,55.25811 11.994885,55.689708 13.829292,53.778974 C 16.203694,51.310651 16.849661,46.487458 16.849661,46.487458 z "
+ id="path3360"
+ sodipodi:nodetypes="cscsc" />
+ </g>
+</svg>
diff --git a/files/whoosh_16.png b/files/whoosh_16.png
new file mode 100644
index 0000000..b0db497
--- /dev/null
+++ b/files/whoosh_16.png
Binary files differ
diff --git a/files/whoosh_35.png b/files/whoosh_35.png
new file mode 100644
index 0000000..7fc7675
--- /dev/null
+++ b/files/whoosh_35.png
Binary files differ
diff --git a/files/whoosh_64.png b/files/whoosh_64.png
new file mode 100644
index 0000000..a026b6d
--- /dev/null
+++ b/files/whoosh_64.png
Binary files differ
diff --git a/files/whoosh_small.svg b/files/whoosh_small.svg
new file mode 100644
index 0000000..0d967b9
--- /dev/null
+++ b/files/whoosh_small.svg
@@ -0,0 +1,604 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="16"
+ height="16"
+ id="svg1872"
+ sodipodi:version="0.32"
+ inkscape:version="0.46"
+ sodipodi:docbase="e:\dev_clean\src\houdini\support\icons\misc"
+ sodipodi:docname="whoosh_small.svg"
+ version="1.0"
+ inkscape:output_extension="org.inkscape.output.svg.inkscape"
+ inkscape:export-filename="C:\Documents and Settings\matt\Desktop\whoosh_small.png"
+ inkscape:export-xdpi="90"
+ inkscape:export-ydpi="90">
+ <defs
+ id="defs1874">
+ <inkscape:perspective
+ sodipodi:type="inkscape:persp3d"
+ inkscape:vp_x="0 : 32 : 1"
+ inkscape:vp_y="0 : 1000 : 0"
+ inkscape:vp_z="64 : 32 : 1"
+ inkscape:persp3d-origin="32 : 21.333333 : 1"
+ id="perspective11831" />
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient3372">
+ <stop
+ style="stop-color:white;stop-opacity:0.50196081"
+ offset="0"
+ id="stop3374" />
+ <stop
+ style="stop-color:white;stop-opacity:0;"
+ offset="1"
+ id="stop3376" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient3264">
+ <stop
+ style="stop-color:#f4bf00;stop-opacity:1"
+ offset="0"
+ id="stop3266" />
+ <stop
+ style="stop-color:#ae0000;stop-opacity:1"
+ offset="1"
+ id="stop3268" />
+ </linearGradient>
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient3254">
+ <stop
+ style="stop-color:#888a85;stop-opacity:1"
+ offset="0"
+ id="stop3256" />
+ <stop
+ style="stop-color:black;stop-opacity:1"
+ offset="1"
+ id="stop3258" />
+ </linearGradient>
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient3120">
+ <stop
+ style="stop-color:black;stop-opacity:1"
+ offset="0"
+ id="stop3122" />
+ <stop
+ style="stop-color:#2e3436;stop-opacity:1"
+ offset="1"
+ id="stop3124" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient3102">
+ <stop
+ id="stop3104"
+ offset="0"
+ style="stop-color:#f30;stop-opacity:1;" />
+ <stop
+ style="stop-color:#ff967c;stop-opacity:1;"
+ offset="0.5"
+ id="stop3106" />
+ <stop
+ id="stop3108"
+ offset="1"
+ style="stop-color:#f30;stop-opacity:1" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient3092">
+ <stop
+ style="stop-color:#f30;stop-opacity:1;"
+ offset="0"
+ id="stop3094" />
+ <stop
+ id="stop3100"
+ offset="0.5"
+ style="stop-color:#831a00;stop-opacity:1;" />
+ <stop
+ style="stop-color:#f30;stop-opacity:1"
+ offset="1"
+ id="stop3096" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2904">
+ <stop
+ style="stop-color:white;stop-opacity:0.50196081;"
+ offset="0"
+ id="stop2906" />
+ <stop
+ style="stop-color:white;stop-opacity:0;"
+ offset="1"
+ id="stop2908" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2882">
+ <stop
+ style="stop-color:#5e5e5e;stop-opacity:1;"
+ offset="0"
+ id="stop2884" />
+ <stop
+ id="stop2890"
+ offset="0.3392857"
+ style="stop-color:#bbb;stop-opacity:1;" />
+ <stop
+ style="stop-color:white;stop-opacity:1;"
+ offset="1"
+ id="stop2886" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2870">
+ <stop
+ style="stop-color:#c8c8c8;stop-opacity:1;"
+ offset="0"
+ id="stop2872" />
+ <stop
+ id="stop2878"
+ offset="0.5"
+ style="stop-color:#f5f5f5;stop-opacity:1;" />
+ <stop
+ style="stop-color:#a6a6a6;stop-opacity:1;"
+ offset="1"
+ id="stop2874" />
+ </linearGradient>
+ <linearGradient
+ inkscape:collect="always"
+ id="linearGradient2830">
+ <stop
+ style="stop-color:white;stop-opacity:1;"
+ offset="0"
+ id="stop2832" />
+ <stop
+ style="stop-color:white;stop-opacity:0.1254902"
+ offset="1"
+ id="stop2834" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2804">
+ <stop
+ id="stop2806"
+ offset="0"
+ style="stop-color:#9b9b9b;stop-opacity:1;" />
+ <stop
+ id="stop2808"
+ offset="1"
+ style="stop-color:#444;stop-opacity:1;" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2789">
+ <stop
+ id="stop2791"
+ offset="0"
+ style="stop-color:#4793ff;stop-opacity:1;" />
+ <stop
+ id="stop2793"
+ offset="1"
+ style="stop-color:#002cc3;stop-opacity:1;" />
+ </linearGradient>
+ <linearGradient
+ id="linearGradient2768">
+ <stop
+ style="stop-color:#5f5f5f;stop-opacity:1;"
+ offset="0"
+ id="stop2770" />
+ <stop
+ style="stop-color:black;stop-opacity:1;"
+ offset="1"
+ id="stop2772" />
+ </linearGradient>
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2768"
+ id="radialGradient2780"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.819915,0.825713,-4.242328,4.218819,108.7504,-85.37568)"
+ cx="24.99999"
+ cy="21.500006"
+ fx="24.988815"
+ fy="20.717813"
+ r="9.9999924" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2830"
+ id="linearGradient2836"
+ x1="21.333336"
+ y1="3.9222705"
+ x2="27.189482"
+ y2="39.764923"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.845056,0.845056,-0.937726,0.937726,37.14432,-15.46766)" />
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2904"
+ id="radialGradient2910"
+ cx="20.615717"
+ cy="19.266575"
+ fx="20.615717"
+ fy="19.266575"
+ r="2.5271387"
+ gradientTransform="matrix(0.887673,0.939084,-7.191088,6.869715,157.5193,-131.9547)"
+ gradientUnits="userSpaceOnUse" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient2943"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(-0.300031,-0.300031,-0.937726,0.937726,65.90146,13.4681)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient2985"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.300031,0.300031,-0.937726,0.937726,50.375,-1.939296)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient2989"
+ gradientUnits="userSpaceOnUse"
+ x1="17.006248"
+ y1="34.714287"
+ x2="6.4563808"
+ y2="38.523811"
+ gradientTransform="matrix(-0.801316,-0.801316,-0.937726,0.937726,78.61505,25.91377)" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2870"
+ id="linearGradient2993"
+ gradientUnits="userSpaceOnUse"
+ x1="6.4938741"
+ y1="37.253971"
+ x2="17.035713"
+ y2="31.984127"
+ gradientTransform="matrix(0.801316,0.801316,-0.937726,0.937726,38.23962,-14.37236)" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3120"
+ id="linearGradient3126"
+ x1="41.179733"
+ y1="42.642097"
+ x2="46.466469"
+ y2="46.35638"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.707107,0.707107,-0.707107,0.707107,36.39568,-16.50071)" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3254"
+ id="linearGradient3260"
+ x1="58.910931"
+ y1="2.7760141"
+ x2="61.007938"
+ y2="5.1569667"
+ gradientUnits="userSpaceOnUse" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3372"
+ id="linearGradient3378"
+ x1="19.142857"
+ y1="45.142857"
+ x2="27"
+ y2="36.285713"
+ gradientUnits="userSpaceOnUse" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient11853"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(-0.801316,-0.801316,-0.937726,0.937726,78.61505,25.91377)"
+ x1="17.006248"
+ y1="34.714287"
+ x2="6.4563808"
+ y2="38.523811" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3120"
+ id="linearGradient11855"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.707107,0.707107,-0.707107,0.707107,36.39568,-16.50071)"
+ x1="41.179733"
+ y1="42.642097"
+ x2="46.466469"
+ y2="46.35638" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient11857"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(-0.300031,-0.300031,-0.937726,0.937726,65.90146,13.4681)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2870"
+ id="linearGradient11859"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.801316,0.801316,-0.937726,0.937726,38.23962,-14.37236)"
+ x1="6.4938741"
+ y1="37.253971"
+ x2="17.035713"
+ y2="31.984127" />
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2768"
+ id="radialGradient11861"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.819915,0.825713,-4.242328,4.218819,108.7504,-85.37568)"
+ cx="24.99999"
+ cy="21.500006"
+ fx="24.988815"
+ fy="20.717813"
+ r="9.9999924" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2830"
+ id="linearGradient11863"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.845056,0.845056,-0.937726,0.937726,37.14432,-15.46766)"
+ x1="21.333336"
+ y1="3.9222705"
+ x2="27.189482"
+ y2="39.764923" />
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2904"
+ id="radialGradient11865"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.887673,0.939084,-7.191088,6.869715,157.5193,-131.9547)"
+ cx="20.615717"
+ cy="19.266575"
+ fx="20.615717"
+ fy="19.266575"
+ r="2.5271387" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3372"
+ id="linearGradient11867"
+ gradientUnits="userSpaceOnUse"
+ x1="19.142857"
+ y1="45.142857"
+ x2="27"
+ y2="36.285713" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient11869"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.300031,0.300031,-0.937726,0.937726,50.375,-1.939296)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient11874"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(8.0727091e-2,8.0727091e-2,-0.2523069,0.2523069,13.716475,-1.8382623)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3372"
+ id="linearGradient11877"
+ gradientUnits="userSpaceOnUse"
+ x1="19.142857"
+ y1="45.142857"
+ x2="27"
+ y2="36.285713"
+ gradientTransform="matrix(0.2690625,0,0,0.2690625,0.1624517,-1.3164705)" />
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2904"
+ id="radialGradient11884"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.2388395,0.2526723,-1.9348521,1.8483827,42.544988,-36.820532)"
+ cx="20.615717"
+ cy="19.266575"
+ fx="20.615717"
+ fy="19.266575"
+ r="2.5271387" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2830"
+ id="linearGradient11887"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.2273729,0.2273729,-0.2523069,0.2523069,10.156595,-5.4782378)"
+ x1="21.333336"
+ y1="3.9222705"
+ x2="27.189482"
+ y2="39.764923" />
+ <radialGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2768"
+ id="radialGradient11891"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.2206084,0.2221684,-1.1414514,1.135126,29.423106,-24.287864)"
+ cx="24.99999"
+ cy="21.500006"
+ fx="24.988815"
+ fy="20.717813"
+ r="9.9999924" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2870"
+ id="linearGradient11895"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.2156041,0.2156041,-0.2523069,0.2523069,10.451299,-5.1835336)"
+ x1="6.4938741"
+ y1="37.253971"
+ x2="17.035713"
+ y2="31.984127" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient11899"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(-8.0727091e-2,-8.0727091e-2,-0.2523069,0.2523069,17.894063,2.3072902)"
+ x1="32.602097"
+ y1="32.222221"
+ x2="46.191555"
+ y2="39.523808" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient2882"
+ id="linearGradient11903"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(-0.2156041,-0.2156041,-0.2523069,0.2523069,21.314814,5.6559532)"
+ x1="17.006248"
+ y1="34.714287"
+ x2="6.4563808"
+ y2="38.523811" />
+ <linearGradient
+ inkscape:collect="always"
+ xlink:href="#linearGradient3120"
+ id="linearGradient11905"
+ gradientUnits="userSpaceOnUse"
+ gradientTransform="matrix(0.190256,0.190256,-0.190256,0.190256,9.9551643,-5.7561928)"
+ x1="41.179733"
+ y1="42.642097"
+ x2="46.466469"
+ y2="46.35638" />
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="23.625"
+ inkscape:cx="1.1534392"
+ inkscape:cy="6.1261123"
+ inkscape:current-layer="layer1"
+ showgrid="false"
+ inkscape:grid-bbox="false"
+ inkscape:document-units="px"
+ inkscape:window-width="1323"
+ inkscape:window-height="1097"
+ inkscape:window-x="88"
+ inkscape:window-y="74"
+ width="64px"
+ height="-64px"
+ showborder="true" />
+ <metadata
+ id="metadata1877">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ id="layer1"
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer">
+ <path
+ style="fill:url(#linearGradient11903);fill-opacity:1;fill-rule:evenodd;stroke:url(#linearGradient11905);stroke-width:0.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 11.63153,8.2756337 C 11.63153,8.2756337 11.910568,11.28199 11.096735,12.835037 C 10.282902,14.388083 8.6785677,15.607184 7.4831817,15.709376 L 6.3743611,14.600556 C 6.3743611,14.600556 7.8611012,13.548207 8.0440567,12.697905 C 8.2270125,11.847604 7.106184,11.19935 7.106184,11.19935 L 11.63153,8.2756337 z"
+ id="path2959"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:url(#linearGradient11899);fill-opacity:1;fill-rule:evenodd;stroke:#2e3436;stroke-width:1.49999964;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 9.0099569,5.7501779 C 9.0099569,5.7501779 7.0312339,6.4987738 5.7094049,7.5438241 C 4.3875765,8.5888744 3.0479791,10.072712 2.6038483,10.92616 L 3.0190159,11.341327 C 3.0190159,11.341327 4.0796702,10.215322 4.7212336,9.8236283 C 5.3627965,9.4319347 5.5852686,9.7745521 5.5852686,9.7745521 L 9.0099569,5.7501779 z"
+ id="path2935"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:#ff3300;fill-opacity:1;fill-rule:evenodd;stroke:#ffffff;stroke-width:0.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 5.1865598,10.590812 C 5.1865598,10.590812 2.5066555,10.250879 1.4346102,12.052328 C 0.41599791,13.76399 1.139905,14.173738 -0.56854943,16.691059 C 2.1177771,15.110993 2.5854629,15.800751 4.1690819,14.5941 C 5.8369763,13.323233 5.1865598,10.590812 5.1865598,10.590812 z"
+ id="path3262"
+ sodipodi:nodetypes="cscsc" />
+ <path
+ style="fill:url(#linearGradient11895);fill-opacity:1;fill-rule:evenodd;stroke:#2e3436;stroke-width:0.75;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 7.8316155,4.4997489 C 7.8316155,4.4997489 4.8252591,4.2207104 3.2722128,5.0345432 C 1.7191662,5.848376 0.50006578,7.4527109 0.39787307,8.6480967 L 1.5066938,9.7569175 C 1.5066938,9.7569175 2.5590425,8.2701771 3.4093438,8.0872214 C 4.2596453,7.9042659 4.9078987,9.0250944 4.9078987,9.0250944 L 7.8316155,4.4997489 z"
+ id="path2838"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:url(#radialGradient11891);fill-opacity:1;fill-rule:evenodd;stroke:#2e3436;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 4.2958035,9.6374866 C 4.2958035,9.6374866 6.7794283,5.0147106 9.3815707,2.482357 C 11.983987,-0.050264958 15.632854,0.43655381 15.632854,0.43655381 C 15.632854,0.43655381 16.147381,4.1062673 13.587052,6.6878381 C 11.026832,9.2692974 6.417554,11.787972 6.417554,11.787972 C 5.4306963,11.325213 4.7323254,10.586427 4.2958035,9.6374866 z"
+ id="path2778"
+ sodipodi:nodetypes="czczcc" />
+ <path
+ sodipodi:type="arc"
+ style="fill:#000000;fill-opacity:0.75294118;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible"
+ id="path2995"
+ sodipodi:cx="27.047619"
+ sodipodi:cy="16.063492"
+ sodipodi:rx="6.2857141"
+ sodipodi:ry="6.2857141"
+ d="M 33.333333,16.063492 A 6.2857141,6.2857141 0 1 1 20.761905,16.063492 A 6.2857141,6.2857141 0 1 1 33.333333,16.063492 z"
+ transform="matrix(0.2614628,0.2614628,-0.2901352,0.2901352,9.3931062,-7.4539099)" />
+ <path
+ style="fill:url(#radialGradient11884);fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="M 14.83934,1.0627666 C 14.83934,1.0627666 12.778307,0.91128741 10.882233,2.561527 C 8.9861592,4.2117665 6.14816,8.5833062 6.14816,8.5833062 C 6.14816,8.5833062 9.7236635,4.9492708 11.619737,3.2990311 C 13.515812,1.6487914 14.83934,1.0627666 14.83934,1.0627666 z"
+ id="path2776"
+ sodipodi:nodetypes="czczc" />
+ <g
+ id="g3116"
+ transform="matrix(0.3570421,0,0,0.3570421,0.5148633,-2.794343)">
+ <path
+ d="M 34.34296,19.103943 C 34.34296,21.473198 32.456312,23.396072 30.131691,23.396072 C 27.807073,23.396072 25.920426,21.473198 25.920426,19.103943 C 25.920426,16.734686 27.807073,14.811811 30.131691,14.811811 C 32.456312,14.811811 34.34296,16.734686 34.34296,19.103943 z M 32.559792,19.065261 C 32.559792,17.677859 31.454998,16.551853 30.093735,16.551853 C 28.73247,16.551853 27.627678,17.677859 27.627678,19.065261 C 27.627678,20.452665 28.73247,21.578673 30.093735,21.578673 C 31.454998,21.578673 32.559792,20.452665 32.559792,19.065261 z"
+ style="fill:#eeeeec;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible"
+ id="path2997" />
+ <rect
+ ry="1.0419143"
+ transform="matrix(0.669447,0.74286,-0.669447,0.74286,0,0)"
+ y="-10.85847"
+ x="38.385689"
+ height="2.0838287"
+ width="5.2897201"
+ id="rect3021"
+ style="opacity:1;fill:#eeeeec;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible" />
+ </g>
+ <path
+ sodipodi:type="arc"
+ style="fill:#ffffff;fill-opacity:0.21960784;fill-rule:nonzero;stroke:none;stroke-width:1.00000012;stroke-linecap:butt;stroke-linejoin:miter;marker:none;marker-start:none;marker-mid:none;marker-end:none;stroke-miterlimit:4.9000001;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;visibility:visible;display:inline;overflow:visible"
+ id="path3130"
+ sodipodi:cx="29.666666"
+ sodipodi:cy="20.238094"
+ sodipodi:rx="3.1904762"
+ sodipodi:ry="3.1904762"
+ d="M 32.857142,20.238094 A 3.1904762,3.1904762 0 1 1 26.47619,20.238094 A 3.1904762,3.1904762 0 1 1 32.857142,20.238094 z"
+ transform="matrix(0.2690625,0,0,0.2690625,3.3033178,-1.4462046)" />
+ <path
+ style="fill:url(#linearGradient11877);fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="M 4.5443266,9.6382171 C 4.5443266,9.6382171 4.8133891,10.35892 5.3515141,10.829779 C 5.8896391,11.300639 6.6968266,11.521655 6.6968266,11.521655 L 9.8102642,9.5229044 L 6.4277641,6.3710296 L 4.5443266,9.6382171 z"
+ id="path3362"
+ sodipodi:nodetypes="czcccc" />
+ <path
+ style="fill:url(#linearGradient11874);fill-opacity:1;fill-rule:evenodd;stroke:#181c1d;stroke-width:1.49999964;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 10.273582,7.0458419 C 10.273582,7.0458419 9.5249861,9.0245646 8.4799356,10.346393 C 7.4348855,11.668222 5.9510478,13.00782 5.0975998,13.451951 L 4.6824321,13.036783 C 4.6824321,13.036783 5.8084377,11.976129 6.2001311,11.334565 C 6.591825,10.693002 6.2492073,10.47053 6.2492073,10.47053 L 10.273582,7.0458419 z"
+ id="path2913"
+ sodipodi:nodetypes="czcczcc" />
+ <path
+ style="fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#ffffff;stroke-width:0.99999994;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:5.19999981;stroke-dasharray:none;stroke-opacity:0.75294118"
+ d="M 9.3929157,8.2253137 C 9.3929157,8.2253137 8.9540455,9.1916807 8.2834457,10.119699 C 8.0163045,10.489385 6.9075884,11.643584 5.077834,13.102315 C 6.3300926,11.83395 6.53117,11.455668 6.6165464,11.311779 C 6.7019232,11.167889 6.7694221,10.636651 6.6903772,10.525567 L 9.3929157,8.2253137 z"
+ id="path2917"
+ sodipodi:nodetypes="csczcc" />
+ <path
+ style="fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
+ d="M 4.6114075,11.294224 C 4.6114075,11.294224 3.4391953,11.59127 3.061006,12.132874 C 2.8285608,12.465759 2.9170436,12.795559 2.2017012,13.850377 C 3.3264858,13.188291 3.5629941,13.281497 3.9591436,12.868864 C 4.4719077,12.335818 4.6114075,11.294224 4.6114075,11.294224 z"
+ id="path3360"
+ sodipodi:nodetypes="cscsc" />
+ </g>
+</svg>
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..22cc32e
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,40 @@
+[wheel]
+universal = 1
+
+[build_sphinx]
+build-dir = docs/build
+source-dir = docs/source
+
+[upload_sphinx]
+upload-dir = docs/build/html
+
+[sdist]
+formats = zip,gztar
+
+[aliases]
+push = sdist bdist_wheel upload
+pushdocs = build_sphinx upload_sphinx
+
+[pytest]
+addopts = -rs --tb=native
+norecursedirs = .hg .tox _build tmp* env* benchmark stress
+minversion = 2.0
+python_files = test_*.py
+pep8ignore =
+ *.py E121 E122 E123 E124 E125 E126 E127 E128 # continuation line indentation
+ *.py E401 # imports on separate lines
+ *.py W391 # blank line at end of file
+ test_*.py E501 # Ignore long lines in tests
+ upload.py ALL # 3rd party (and not in the repo): rietveld upload tool
+ docs/source/conf.py ALL # sphinx stuff, automatically generated, don't check this
+ src/whoosh/lang/*.py ALL # 3rd party / crashing py.test with non-ascii stuff
+ src/whoosh/lang/snowball/*.py ALL # 3rd party
+ src/whoosh/support/relativedelta.py ALL # 3rd party
+ src/whoosh/support/charset.py ALL # non-ascii py.test crash
+ src/whoosh/support/unicode.py ALL # non-ascii py.test crash
+
+[egg_info]
+tag_build =
+tag_date = 0
+tag_svn_revision = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1229018
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,60 @@
+#!python
+
+import os.path, sys
+from setuptools import setup, find_packages
+from setuptools.command.test import test as TestCommand
+
+try:
+ import pytest
+except ImportError:
+ pytest = None
+
+sys.path.insert(0, os.path.abspath("src"))
+from whoosh import __version__, versionstring
+
+
+class PyTest(TestCommand):
+ def finalize_options(self):
+ TestCommand.finalize_options(self)
+ self.test_args = []
+ self.test_suite = True
+
+ def run_tests(self):
+ # import here, because outside of run_tests the test eggs aren't loaded yet
+ import pytest
+ pytest.main(self.test_args)
+
+
+if __name__ == "__main__":
+ setup(
+ name="Whoosh",
+ version=versionstring(),
+ package_dir={'': 'src'},
+ packages=find_packages("src"),
+
+ author="Matt Chaput",
+ author_email="matt@whoosh.ca",
+
+ description="Fast, pure-Python full text indexing, search, and spell checking library.",
+ long_description=open("README.txt").read(),
+
+ license="Two-clause BSD license",
+ keywords="index search text spell",
+ url="http://bitbucket.org/mchaput/whoosh",
+
+ zip_safe=True,
+ tests_require=['pytest'],
+ cmdclass={'test': PyTest},
+
+ classifiers=[
+ "Development Status :: 5 - Production/Stable",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: BSD License",
+ "Natural Language :: English",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 2.5",
+ "Programming Language :: Python :: 3",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+ "Topic :: Text Processing :: Indexing",
+ ],
+ )
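The ``PyTest`` command above simply defers to ``pytest.main()`` with an empty
argument list, so running the suite through the packaging machinery is
equivalent to invoking pytest directly. A minimal sketch of that equivalent
call (assuming pytest is installed; this snippet is illustrative and not part
of the diff)::

    import pytest

    # Same effect as `python setup.py test` with the PyTest command above:
    # run the test_*.py files using the options configured in setup.cfg.
    raise SystemExit(pytest.main([]))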
diff --git a/src/Whoosh.egg-info/PKG-INFO b/src/Whoosh.egg-info/PKG-INFO
new file mode 100644
index 0000000..84d0d80
--- /dev/null
+++ b/src/Whoosh.egg-info/PKG-INFO
@@ -0,0 +1,88 @@
+Metadata-Version: 1.1
+Name: Whoosh
+Version: 2.7.0
+Summary: Fast, pure-Python full text indexing, search, and spell checking library.
+Home-page: http://bitbucket.org/mchaput/whoosh
+Author: Matt Chaput
+Author-email: matt@whoosh.ca
+License: Two-clause BSD license
+Description: About Whoosh
+ ============
+
+ Whoosh is a fast, featureful full-text indexing and searching library
+ implemented in pure Python. Programmers can use it to easily add search
+ functionality to their applications and websites. Every part of how Whoosh
+ works can be extended or replaced to meet your needs exactly.
+
+ Some of Whoosh's features include:
+
+ * Pythonic API.
+ * Pure-Python. No compilation or binary packages needed, no mysterious crashes.
+ * Fielded indexing and search.
+ * Fast indexing and retrieval -- faster than any other pure-Python, scoring,
+ full-text search solution I know of.
+ * Pluggable scoring algorithm (including BM25F), text analysis, storage,
+ posting format, etc.
+ * Powerful query language.
+ * Pure Python spell-checker (as far as I know, the only one).
+
+ Whoosh might be useful in the following circumstances:
+
+ * Anywhere a pure-Python solution is desirable to avoid having to build/compile
+ native libraries (or force users to build/compile them).
+ * As a research platform (at least for programmers that find Python easier to
+ read and work with than Java ;)
+ * When an easy-to-use Pythonic interface is more important to you than raw
+ speed.
+
+ Whoosh was created and is maintained by Matt Chaput. It was originally created
+ for use in the online help system of Side Effects Software's 3D animation
+ software Houdini. Side Effects Software Inc. graciously agreed to open-source
+ the code.
+
+ This software is licensed under the terms of the simplified BSD (A.K.A. "two
+ clause" or "FreeBSD") license. See LICENSE.txt for information.
+
+ Installing Whoosh
+ =================
+
+ If you have ``setuptools`` or ``pip`` installed, you can use ``easy_install``
+ or ``pip`` to download and install Whoosh automatically::
+
+ $ easy_install Whoosh
+
+ or
+
+ $ pip install Whoosh
+
+ Learning more
+ =============
+
+ * Read the online documentation at http://packages.python.org/Whoosh/
+
+ * Join the Whoosh mailing list at http://groups.google.com/group/whoosh
+
+ * File bug reports and view the Whoosh wiki at
+ http://bitbucket.org/mchaput/whoosh/
+
+ Getting the source
+ ==================
+
+ Download source releases from PyPI at http://pypi.python.org/pypi/Whoosh/
+
+ You can check out the latest version of the source code using Mercurial::
+
+ hg clone http://bitbucket.org/mchaput/whoosh
+
+
+Keywords: index search text spell
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 2.5
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Indexing
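The metadata above covers installation but not basic use. For orientation, a
minimal indexing-and-searching sketch in the spirit of the package's
quickstart documentation (the schema fields, directory name, and sample text
are illustrative only, not taken from this file)::

    import os
    from whoosh.index import create_in
    from whoosh.fields import Schema, TEXT, ID
    from whoosh.qparser import QueryParser

    # A schema declares the fields each document has and how they are indexed.
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")                 # the index lives in a directory
    ix = create_in("indexdir", schema)

    # Add a document and commit it to the index.
    writer = ix.writer()
    writer.add_document(title=u"First document", path=u"/a",
                        content=u"This is the first document we've added!")
    writer.commit()

    # Parse a query string against the "content" field and search.
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("first")
        results = searcher.search(query)
        print(results[0])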
diff --git a/src/Whoosh.egg-info/SOURCES.txt b/src/Whoosh.egg-info/SOURCES.txt
new file mode 100644
index 0000000..ad9c0d2
--- /dev/null
+++ b/src/Whoosh.egg-info/SOURCES.txt
@@ -0,0 +1,224 @@
+LICENSE.txt
+MANIFEST.in
+README.txt
+setup.cfg
+setup.py
+benchmark/dcvgr10.txt.gz
+benchmark/dictionary.py
+benchmark/enron.py
+benchmark/marc21.py
+benchmark/reuters.py
+benchmark/reuters21578.txt.gz
+docs/source/analysis.rst
+docs/source/batch.rst
+docs/source/conf.py
+docs/source/dates.rst
+docs/source/facets.rst
+docs/source/fieldcaches.rst
+docs/source/glossary.rst
+docs/source/highlight.rst
+docs/source/index.rst
+docs/source/indexing.rst
+docs/source/intro.rst
+docs/source/keywords.rst
+docs/source/nested.rst
+docs/source/ngrams.rst
+docs/source/parsing.rst
+docs/source/query.rst
+docs/source/querylang.rst
+docs/source/quickstart.rst
+docs/source/recipes.rst
+docs/source/schema.rst
+docs/source/searching.rst
+docs/source/spelling.rst
+docs/source/stemming.rst
+docs/source/threads.rst
+docs/source/api/analysis.rst
+docs/source/api/api.rst
+docs/source/api/collectors.rst
+docs/source/api/columns.rst
+docs/source/api/fields.rst
+docs/source/api/formats.rst
+docs/source/api/highlight.rst
+docs/source/api/idsets.rst
+docs/source/api/index.rst
+docs/source/api/matching.rst
+docs/source/api/qparser.rst
+docs/source/api/query.rst
+docs/source/api/reading.rst
+docs/source/api/scoring.rst
+docs/source/api/searching.rst
+docs/source/api/sorting.rst
+docs/source/api/spelling.rst
+docs/source/api/util.rst
+docs/source/api/writing.rst
+docs/source/api/codec/base.rst
+docs/source/api/filedb/filestore.rst
+docs/source/api/filedb/filetables.rst
+docs/source/api/filedb/structfile.rst
+docs/source/api/lang/morph_en.rst
+docs/source/api/lang/porter.rst
+docs/source/api/lang/wordnet.rst
+docs/source/api/support/charset.rst
+docs/source/api/support/levenshtein.rst
+docs/source/releases/0_3.rst
+docs/source/releases/1_0.rst
+docs/source/releases/2_0.rst
+docs/source/releases/index.rst
+docs/source/tech/backend.rst
+docs/source/tech/filedb.rst
+docs/source/tech/index.rst
+files/whoosh.svg
+files/whoosh_16.png
+files/whoosh_35.png
+files/whoosh_64.png
+files/whoosh_small.svg
+src/Whoosh.egg-info/PKG-INFO
+src/Whoosh.egg-info/SOURCES.txt
+src/Whoosh.egg-info/dependency_links.txt
+src/Whoosh.egg-info/top_level.txt
+src/Whoosh.egg-info/zip-safe
+src/whoosh/__init__.py
+src/whoosh/classify.py
+src/whoosh/collectors.py
+src/whoosh/columns.py
+src/whoosh/compat.py
+src/whoosh/externalsort.py
+src/whoosh/fields.py
+src/whoosh/formats.py
+src/whoosh/highlight.py
+src/whoosh/idsets.py
+src/whoosh/index.py
+src/whoosh/legacy.py
+src/whoosh/multiproc.py
+src/whoosh/reading.py
+src/whoosh/scoring.py
+src/whoosh/searching.py
+src/whoosh/sorting.py
+src/whoosh/spelling.py
+src/whoosh/system.py
+src/whoosh/writing.py
+src/whoosh/analysis/__init__.py
+src/whoosh/analysis/acore.py
+src/whoosh/analysis/analyzers.py
+src/whoosh/analysis/filters.py
+src/whoosh/analysis/intraword.py
+src/whoosh/analysis/morph.py
+src/whoosh/analysis/ngrams.py
+src/whoosh/analysis/tokenizers.py
+src/whoosh/automata/__init__.py
+src/whoosh/automata/fsa.py
+src/whoosh/automata/glob.py
+src/whoosh/automata/lev.py
+src/whoosh/automata/nfa.py
+src/whoosh/automata/reg.py
+src/whoosh/codec/__init__.py
+src/whoosh/codec/base.py
+src/whoosh/codec/memory.py
+src/whoosh/codec/plaintext.py
+src/whoosh/codec/whoosh3.py
+src/whoosh/filedb/__init__.py
+src/whoosh/filedb/compound.py
+src/whoosh/filedb/filestore.py
+src/whoosh/filedb/filetables.py
+src/whoosh/filedb/gae.py
+src/whoosh/filedb/structfile.py
+src/whoosh/lang/__init__.py
+src/whoosh/lang/dmetaphone.py
+src/whoosh/lang/isri.py
+src/whoosh/lang/lovins.py
+src/whoosh/lang/morph_en.py
+src/whoosh/lang/paicehusk.py
+src/whoosh/lang/phonetic.py
+src/whoosh/lang/porter.py
+src/whoosh/lang/porter2.py
+src/whoosh/lang/stopwords.py
+src/whoosh/lang/wordnet.py
+src/whoosh/lang/snowball/__init__.py
+src/whoosh/lang/snowball/bases.py
+src/whoosh/lang/snowball/danish.py
+src/whoosh/lang/snowball/dutch.py
+src/whoosh/lang/snowball/english.py
+src/whoosh/lang/snowball/finnish.py
+src/whoosh/lang/snowball/french.py
+src/whoosh/lang/snowball/german.py
+src/whoosh/lang/snowball/hungarian.py
+src/whoosh/lang/snowball/italian.py
+src/whoosh/lang/snowball/norwegian.py
+src/whoosh/lang/snowball/portugese.py
+src/whoosh/lang/snowball/romanian.py
+src/whoosh/lang/snowball/russian.py
+src/whoosh/lang/snowball/spanish.py
+src/whoosh/lang/snowball/swedish.py
+src/whoosh/matching/__init__.py
+src/whoosh/matching/binary.py
+src/whoosh/matching/combo.py
+src/whoosh/matching/mcore.py
+src/whoosh/matching/wrappers.py
+src/whoosh/qparser/__init__.py
+src/whoosh/qparser/common.py
+src/whoosh/qparser/dateparse.py
+src/whoosh/qparser/default.py
+src/whoosh/qparser/plugins.py
+src/whoosh/qparser/syntax.py
+src/whoosh/qparser/taggers.py
+src/whoosh/query/__init__.py
+src/whoosh/query/compound.py
+src/whoosh/query/nested.py
+src/whoosh/query/positional.py
+src/whoosh/query/qcolumns.py
+src/whoosh/query/qcore.py
+src/whoosh/query/ranges.py
+src/whoosh/query/spans.py
+src/whoosh/query/terms.py
+src/whoosh/query/wrappers.py
+src/whoosh/support/__init__.py
+src/whoosh/support/base85.py
+src/whoosh/support/bench.py
+src/whoosh/support/charset.py
+src/whoosh/support/levenshtein.py
+src/whoosh/support/relativedelta.py
+src/whoosh/support/unicode.py
+src/whoosh/util/__init__.py
+src/whoosh/util/cache.py
+src/whoosh/util/filelock.py
+src/whoosh/util/loading.py
+src/whoosh/util/numeric.py
+src/whoosh/util/numlists.py
+src/whoosh/util/testing.py
+src/whoosh/util/text.py
+src/whoosh/util/times.py
+src/whoosh/util/varints.py
+src/whoosh/util/versions.py
+tests/test_analysis.py
+tests/test_automata.py
+tests/test_bits.py
+tests/test_classify.py
+tests/test_codecs.py
+tests/test_collector.py
+tests/test_columns.py
+tests/test_compound.py
+tests/test_dateparse.py
+tests/test_fields.py
+tests/test_flexible.py
+tests/test_highlighting.py
+tests/test_indexing.py
+tests/test_matching.py
+tests/test_misc.py
+tests/test_mpwriter.py
+tests/test_nested.py
+tests/test_parse_plugins.py
+tests/test_parsing.py
+tests/test_postings.py
+tests/test_quality.py
+tests/test_queries.py
+tests/test_reading.py
+tests/test_results.py
+tests/test_searching.py
+tests/test_sorting.py
+tests/test_spans.py
+tests/test_spelling.py
+tests/test_tables.py
+tests/test_vectors.py
+tests/test_weightings.py
+tests/test_writing.py \ No newline at end of file
diff --git a/src/Whoosh.egg-info/dependency_links.txt b/src/Whoosh.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/Whoosh.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/Whoosh.egg-info/top_level.txt b/src/Whoosh.egg-info/top_level.txt
new file mode 100644
index 0000000..d752255
--- /dev/null
+++ b/src/Whoosh.egg-info/top_level.txt
@@ -0,0 +1 @@
+whoosh
diff --git a/src/Whoosh.egg-info/zip-safe b/src/Whoosh.egg-info/zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/Whoosh.egg-info/zip-safe
@@ -0,0 +1 @@
+
diff --git a/src/whoosh/__init__.py b/src/whoosh/__init__.py
new file mode 100644
index 0000000..414f8bb
--- /dev/null
+++ b/src/whoosh/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2008 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+__version__ = (2, 7, 0)
+
+
+def versionstring(build=True, extra=True):
+ """Returns the version number of Whoosh as a string.
+
+ :param build: Whether to include the build number in the string.
+ :param extra: Whether to include alpha/beta/rc etc. tags. Only
+ checked if build is True.
+ :rtype: str
+ """
+
+ if build:
+ first = 3
+ else:
+ first = 2
+
+ s = ".".join(str(n) for n in __version__[:first])
+ if build and extra:
+ s += "".join(str(n) for n in __version__[3:])
+
+ return s
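With ``__version__ = (2, 7, 0)`` and no alpha/beta components after the third
element, the helper above behaves as follows (a small illustration, not part
of the module)::

    from whoosh import versionstring

    versionstring()              # -> "2.7.0"  (first three components joined with dots)
    versionstring(build=False)   # -> "2.7"    (major.minor only)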
diff --git a/src/whoosh/analysis/__init__.py b/src/whoosh/analysis/__init__.py
new file mode 100644
index 0000000..66293bc
--- /dev/null
+++ b/src/whoosh/analysis/__init__.py
@@ -0,0 +1,69 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+"""Classes and functions for turning a piece of text into an indexable stream
+of "tokens" (usually equivalent to words). There are three general classes
+involved in analysis:
+
+* Tokenizers are always at the start of the text processing pipeline. They take
+ a string and yield Token objects (actually, the same token object over and
+ over, for performance reasons) corresponding to the tokens (words) in the
+ text.
+
+ Every tokenizer is a callable that takes a string and returns an iterator of
+ tokens.
+
+* Filters take the tokens from the tokenizer and perform various
+ transformations on them. For example, the LowercaseFilter converts all tokens
+ to lowercase, which is usually necessary when indexing regular English text.
+
+ Every filter is a callable that takes a token generator and returns a token
+ generator.
+
+* Analyzers are convenience functions/classes that "package up" a tokenizer and
+ zero or more filters into a single unit. For example, the StandardAnalyzer
+ combines a RegexTokenizer, LowercaseFilter, and StopFilter.
+
+ Every analyzer is a callable that takes a string and returns a token
+ iterator. (So Tokenizers can be used as Analyzers if you don't need any
+ filtering).
+
+You can compose tokenizers and filters together using the ``|`` character::
+
+ my_analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
+
+The first item must be a tokenizer and the rest must be filters (you can't put
+a filter first or a tokenizer after the first item).
+"""
+
+from whoosh.analysis.acore import *
+from whoosh.analysis.tokenizers import *
+from whoosh.analysis.filters import *
+from whoosh.analysis.morph import *
+from whoosh.analysis.intraword import *
+from whoosh.analysis.ngrams import *
+from whoosh.analysis.analyzers import *
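The pipeline composition described in the module docstring above can be
exercised directly; a short sketch (the sample sentence is arbitrary)::

    from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter

    # Tokenizer first, then any number of filters, chained with the | operator.
    ana = RegexTokenizer() | LowercaseFilter() | StopFilter()
    print([t.text for t in ana("The Quick Brown Fox")])
    # -> ['quick', 'brown', 'fox']   ("The" is lowercased and removed as a stop word)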
diff --git a/src/whoosh/analysis/acore.py b/src/whoosh/analysis/acore.py
new file mode 100644
index 0000000..adb53b7
--- /dev/null
+++ b/src/whoosh/analysis/acore.py
@@ -0,0 +1,156 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.compat import iteritems
+
+
+# Exceptions
+
+class CompositionError(Exception):
+ pass
+
+
+# Utility functions
+
+def unstopped(tokenstream):
+ """Removes tokens from a token stream where token.stopped = True.
+ """
+ return (t for t in tokenstream if not t.stopped)
+
+
+def entoken(textstream, positions=False, chars=False, start_pos=0,
+ start_char=0, **kwargs):
+ """Takes a sequence of unicode strings and yields a series of Token objects
+ (actually the same Token object over and over, for performance reasons),
+ with the attributes filled in with reasonable values (for example, if
+ ``positions`` or ``chars`` is True, the function assumes each token was
+ separated by one space).
+ """
+
+ pos = start_pos
+ char = start_char
+ t = Token(positions=positions, chars=chars, **kwargs)
+
+ for text in textstream:
+ t.text = text
+
+ if positions:
+ t.pos = pos
+ pos += 1
+
+ if chars:
+ t.startchar = char
+ char = char + len(text)
+ t.endchar = char
+
+ yield t
+
+
+# Token object
+
+class Token(object):
+ """
+ Represents a "token" (usually a word) extracted from the source text being
+ indexed.
+
+ See "Advanced analysis" in the user guide for more information.
+
+ Because object instantiation in Python is slow, tokenizers should create
+ ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the attributes
+ each time.
+
+ This trick means that consumers of tokens (i.e. filters) must never try to
+ hold onto the token object between loop iterations, or convert the token
+ generator into a list. Instead, save the attributes between iterations,
+ not the object::
+
+ def RemoveDuplicatesFilter(self, stream):
+ # Removes duplicate words.
+ lasttext = None
+ for token in stream:
+ # Only yield the token if its text doesn't
+ # match the previous token.
+ if lasttext != token.text:
+ yield token
+ lasttext = token.text
+
+ ...or, call token.copy() to get a copy of the token object.
+ """
+
+ def __init__(self, positions=False, chars=False, removestops=True, mode='',
+ **kwargs):
+ """
+ :param positions: Whether tokens should have the token position in the
+ 'pos' attribute.
+ :param chars: Whether tokens should have character offsets in the
+ 'startchar' and 'endchar' attributes.
+ :param removestops: whether to remove stop words from the stream (if
+ the tokens pass through a stop filter).
+ :param mode: contains a string describing the purpose for which the
+ analyzer is being called, i.e. 'index' or 'query'.
+ """
+
+ self.positions = positions
+ self.chars = chars
+ self.stopped = False
+ self.boost = 1.0
+ self.removestops = removestops
+ self.mode = mode
+ self.__dict__.update(kwargs)
+
+ def __repr__(self):
+ parms = ", ".join("%s=%r" % (name, value)
+ for name, value in iteritems(self.__dict__))
+ return "%s(%s)" % (self.__class__.__name__, parms)
+
+ def copy(self):
+ # This is faster than using the copy module
+ return Token(**self.__dict__)
+
+
+# Composition support
+
+class Composable(object):
+ is_morph = False
+
+ def __or__(self, other):
+ from whoosh.analysis.analyzers import CompositeAnalyzer
+
+ if not isinstance(other, Composable):
+ raise TypeError("%r is not composable with %r" % (self, other))
+ return CompositeAnalyzer(self, other)
+
+ def __repr__(self):
+ attrs = ""
+ if self.__dict__:
+ attrs = ", ".join("%s=%r" % (key, value)
+ for key, value
+ in iteritems(self.__dict__))
+ return self.__class__.__name__ + "(%s)" % attrs
+
+ def has_morph(self):
+ return self.is_morph
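Because ``entoken`` (like the tokenizers described above) reuses a single
``Token`` instance, consumers should copy out the attribute values they need
rather than keep references to the yielded object; a brief sketch::

    from whoosh.analysis.acore import entoken

    # Correct: extract the attribute values as the generator runs.
    texts = [t.text for t in entoken([u"alfa", u"bravo", u"charlie"], positions=True)]
    print(texts)   # ['alfa', 'bravo', 'charlie']

    # Not useful: every entry is the SAME reused Token object, now holding
    # whatever attributes were set on the final iteration.
    tokens = list(entoken([u"alfa", u"bravo", u"charlie"]))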
diff --git a/src/whoosh/analysis/analyzers.py b/src/whoosh/analysis/analyzers.py
new file mode 100644
index 0000000..f7d6e3c
--- /dev/null
+++ b/src/whoosh/analysis/analyzers.py
@@ -0,0 +1,296 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.analysis.acore import Composable, CompositionError
+from whoosh.analysis.tokenizers import Tokenizer
+from whoosh.analysis.filters import LowercaseFilter
+from whoosh.analysis.filters import StopFilter, STOP_WORDS
+from whoosh.analysis.morph import StemFilter
+from whoosh.analysis.intraword import IntraWordFilter
+from whoosh.analysis.tokenizers import default_pattern
+from whoosh.analysis.tokenizers import CommaSeparatedTokenizer
+from whoosh.analysis.tokenizers import IDTokenizer
+from whoosh.analysis.tokenizers import RegexTokenizer
+from whoosh.analysis.tokenizers import SpaceSeparatedTokenizer
+from whoosh.lang.porter import stem
+
+
+# Analyzers
+
+class Analyzer(Composable):
+ """ Abstract base class for analyzers.
+ """
+
+ def __repr__(self):
+ return "%s()" % self.__class__.__name__
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.__dict__ == other.__dict__)
+
+ def __call__(self, value, **kwargs):
+ raise NotImplementedError
+
+ def clean(self):
+ pass
+
+
+class CompositeAnalyzer(Analyzer):
+ def __init__(self, *composables):
+ self.items = []
+
+ for comp in composables:
+ if isinstance(comp, CompositeAnalyzer):
+ self.items.extend(comp.items)
+ else:
+ self.items.append(comp)
+
+ # Tokenizers must start a chain, and then only filters after that
+ # (because analyzers take a string and return a generator of tokens,
+ # and filters take and return generators of tokens)
+ for item in self.items[1:]:
+ if isinstance(item, Tokenizer):
+ raise CompositionError("Only one tokenizer allowed at the start"
+ " of the analyzer: %r" % self.items)
+
+ def __repr__(self):
+ return "%s(%s)" % (self.__class__.__name__,
+ ", ".join(repr(item) for item in self.items))
+
+ def __call__(self, value, no_morph=False, **kwargs):
+ items = self.items
+ # Start with tokenizer
+ gen = items[0](value, **kwargs)
+ # Run filters
+ for item in items[1:]:
+ if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
+ gen = item(gen)
+ return gen
+
+ def __getitem__(self, item):
+ return self.items.__getitem__(item)
+
+ def __len__(self):
+ return len(self.items)
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.items == other.items)
+
+ def clean(self):
+ for item in self.items:
+ if hasattr(item, "clean"):
+ item.clean()
+
+ def has_morph(self):
+ return any(item.is_morph for item in self.items)
+
+
+# Functions that return composed analyzers
+
+def IDAnalyzer(lowercase=False):
+ """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
+ desired.
+ """
+
+ tokenizer = IDTokenizer()
+ if lowercase:
+ tokenizer = tokenizer | LowercaseFilter()
+ return tokenizer
+
+
+def KeywordAnalyzer(lowercase=False, commas=False):
+ """Parses whitespace- or comma-separated tokens.
+
+ >>> ana = KeywordAnalyzer()
+ >>> [token.text for token in ana("Hello there, this is a TEST")]
+ ["Hello", "there,", "this", "is", "a", "TEST"]
+
+ :param lowercase: whether to lowercase the tokens.
+ :param commas: if True, items are separated by commas rather than
+ whitespace.
+ """
+
+ if commas:
+ tokenizer = CommaSeparatedTokenizer()
+ else:
+ tokenizer = SpaceSeparatedTokenizer()
+ if lowercase:
+ tokenizer = tokenizer | LowercaseFilter()
+ return tokenizer
+
+
+def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
+ """Deprecated, just use a RegexTokenizer directly.
+ """
+
+ return RegexTokenizer(expression=expression, gaps=gaps)
+
+
+def SimpleAnalyzer(expression=default_pattern, gaps=False):
+ """Composes a RegexTokenizer with a LowercaseFilter.
+
+ >>> ana = SimpleAnalyzer()
+ >>> [token.text for token in ana("Hello there, this is a TEST")]
+ ["hello", "there", "this", "is", "a", "test"]
+
+ :param expression: The regular expression pattern to use to extract tokens.
+ :param gaps: If True, the tokenizer *splits* on the expression, rather
+ than matching on the expression.
+ """
+
+ return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()
+
+
+def StandardAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
+ minsize=2, maxsize=None, gaps=False):
+ """Composes a RegexTokenizer with a LowercaseFilter and optional
+ StopFilter.
+
+ >>> ana = StandardAnalyzer()
+ >>> [token.text for token in ana("Testing is testing and testing")]
+ ["testing", "testing", "testing"]
+
+ :param expression: The regular expression pattern to use to extract tokens.
+ :param stoplist: A list of stop words. Set this to None to disable
+ the stop word filter.
+ :param minsize: Words smaller than this are removed from the stream.
+ :param maxsize: Words longer than this are removed from the stream.
+ :param gaps: If True, the tokenizer *splits* on the expression, rather
+ than matching on the expression.
+ """
+
+ ret = RegexTokenizer(expression=expression, gaps=gaps)
+ chain = ret | LowercaseFilter()
+ if stoplist is not None:
+ chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
+ maxsize=maxsize)
+ return chain
+
+
+def StemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
+ minsize=2, maxsize=None, gaps=False, stemfn=stem,
+ ignore=None, cachesize=50000):
+ """Composes a RegexTokenizer with a lower case filter, an optional stop
+ filter, and a stemming filter.
+
+ >>> ana = StemmingAnalyzer()
+ >>> [token.text for token in ana("Testing is testing and testing")]
+ ["test", "test", "test"]
+
+ :param expression: The regular expression pattern to use to extract tokens.
+ :param stoplist: A list of stop words. Set this to None to disable
+ the stop word filter.
+ :param minsize: Words smaller than this are removed from the stream.
+ :param maxsize: Words longer than this are removed from the stream.
+ :param gaps: If True, the tokenizer *splits* on the expression, rather
+ than matching on the expression.
+ :param ignore: a set of words to not stem.
+ :param cachesize: the maximum number of stemmed words to cache. The larger
+ this number, the faster stemming will be but the more memory it will
+ use. Use None for no cache, or -1 for an unbounded cache.
+ """
+
+ ret = RegexTokenizer(expression=expression, gaps=gaps)
+ chain = ret | LowercaseFilter()
+ if stoplist is not None:
+ chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
+ maxsize=maxsize)
+ return chain | StemFilter(stemfn=stemfn, ignore=ignore,
+ cachesize=cachesize)
+
+
+def FancyAnalyzer(expression=r"\s+", stoplist=STOP_WORDS, minsize=2,
+ maxsize=None, gaps=True, splitwords=True, splitnums=True,
+ mergewords=False, mergenums=False):
+ """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
+ StopFilter.
+
+ >>> ana = FancyAnalyzer()
+ >>> [token.text for token in ana("Should I call getInt or get_real?")]
+ ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]
+
+ :param expression: The regular expression pattern to use to extract tokens.
+ :param stoplist: A list of stop words. Set this to None to disable
+ the stop word filter.
+ :param minsize: Words smaller than this are removed from the stream.
+ :param maxsize: Words longer than this are removed from the stream.
+ :param gaps: If True, the tokenizer *splits* on the expression, rather
+ than matching on the expression.
+ """
+
+ return (RegexTokenizer(expression=expression, gaps=gaps)
+ | IntraWordFilter(splitwords=splitwords, splitnums=splitnums,
+ mergewords=mergewords, mergenums=mergenums)
+ | LowercaseFilter()
+ | StopFilter(stoplist=stoplist, minsize=minsize)
+ )
+
+
+def LanguageAnalyzer(lang, expression=default_pattern, gaps=False,
+ cachesize=50000):
+ """Configures a simple analyzer for the given language, with a
+ LowercaseFilter, StopFilter, and StemFilter.
+
+ >>> ana = LanguageAnalyzer("es")
+ >>> [token.text for token in ana("Por el mar corren las liebres")]
+ ['mar', 'corr', 'liebr']
+
+ The list of available languages is in `whoosh.lang.languages`.
+ You can use :func:`whoosh.lang.has_stemmer` and
+ :func:`whoosh.lang.has_stopwords` to check if a given language has a
+ stemming function and/or stop word list available.
+
+ :param expression: The regular expression pattern to use to extract tokens.
+ :param gaps: If True, the tokenizer *splits* on the expression, rather
+ than matching on the expression.
+ :param cachesize: the maximum number of stemmed words to cache. The larger
+ this number, the faster stemming will be but the more memory it will
+ use.
+ """
+
+ from whoosh.lang import NoStemmer, NoStopWords
+
+ # Make the start of the chain
+ chain = (RegexTokenizer(expression=expression, gaps=gaps)
+ | LowercaseFilter())
+
+ # Add a stop word filter
+ try:
+ chain = chain | StopFilter(lang=lang)
+ except NoStopWords:
+ pass
+
+ # Add a stemming filter
+ try:
+ chain = chain | StemFilter(lang=lang, cachesize=cachesize)
+ except NoStemmer:
+ pass
+
+ return chain
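Analyzers built with the ``|`` operator, including those returned by the
factory functions above, are ``CompositeAnalyzer`` instances and so support
``len()`` and indexing; passing ``stoplist=None`` skips the stop filter
entirely. A small sketch (the sample text is arbitrary)::

    from whoosh.analysis import StandardAnalyzer

    ana = StandardAnalyzer(stoplist=None)        # no StopFilter in the chain
    print([t.text for t in ana("This is a TEST")])
    # -> ['this', 'is', 'a', 'test']

    print(len(ana))   # 2: the RegexTokenizer followed by the LowercaseFilter
    print(ana[0])     # the tokenizer at the head of the chain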
diff --git a/src/whoosh/analysis/filters.py b/src/whoosh/analysis/filters.py
new file mode 100644
index 0000000..add9c98
--- /dev/null
+++ b/src/whoosh/analysis/filters.py
@@ -0,0 +1,479 @@
+# coding=utf-8
+
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from itertools import chain
+
+from whoosh.compat import next, xrange
+from whoosh.analysis.acore import Composable
+from whoosh.util.text import rcompile
+
+
+# Default list of stop words (words so common it's usually wasteful to index
+# them). This list is used by the StopFilter class, which allows you to supply
+# an optional list to override this one.
+
+STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
+ 'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
+ 'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
+ 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
+ 'you', 'your'))
+
+
+# Simple pattern for filtering URLs, may be useful
+
+url_pattern = rcompile("""
+(
+ [A-Za-z+]+:// # URL protocol
+ \\S+? # URL body
+ (?=\\s|[.]\\s|$|[.]$) # Stop at space/end, or a dot followed by space/end
+) | ( # or...
+ \w+([:.]?\w+)* # word characters, with opt. internal colons/dots
+)
+""", verbose=True)
+
+
+# Filters
+
+class Filter(Composable):
+ """Base class for Filter objects. A Filter subclass must implement a
+ filter() method that takes a single argument, which is an iterator of Token
+ objects, and yield a series of Token objects in return.
+
+ Filters that do morphological transformation of tokens (e.g. stemming)
+ should set their ``is_morph`` attribute to True.
+ """
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.__dict__ == other.__dict__)
+
+ def __ne__(self, other):
+ return not self == other
+
+ def __call__(self, tokens):
+ raise NotImplementedError
+
+
+class PassFilter(Filter):
+ """An identity filter: passes the tokens through untouched.
+ """
+
+ def __call__(self, tokens):
+ return tokens
+
+
+class LoggingFilter(Filter):
+ """Prints the contents of every filter that passes through as a debug
+ log entry.
+ """
+
+ def __init__(self, logger=None):
+ """
+ :param logger: the logger to use. If omitted, the "whoosh.analysis"
+ logger is used.
+ """
+
+ if logger is None:
+ import logging
+ logger = logging.getLogger("whoosh.analysis")
+ self.logger = logger
+
+ def __call__(self, tokens):
+ logger = self.logger
+ for t in tokens:
+ logger.debug(repr(t))
+ yield t
+
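+# A minimal usage sketch for LoggingFilter (illustrative only): drop it into a
+# chain to see each token that reaches that point, once the "whoosh.analysis"
+# logger is configured at DEBUG level.
+#
+#   ana = RegexTokenizer() | LoggingFilter() | LowercaseFilter()
+#   tokens = [t.text for t in ana(u"Hello There")]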
+
+class MultiFilter(Filter):
+ """Chooses one of two or more sub-filters based on the 'mode' attribute
+ of the token stream.
+ """
+
+ default_filter = PassFilter()
+
+ def __init__(self, **kwargs):
+ """Use keyword arguments to associate mode attribute values with
+ instantiated filters.
+
+ >>> iwf_for_index = IntraWordFilter(mergewords=True, mergenums=False)
+ >>> iwf_for_query = IntraWordFilter(mergewords=False, mergenums=False)
+ >>> mf = MultiFilter(index=iwf_for_index, query=iwf_for_query)
+
+ This class expects that the value of the mode attribute is consistent
+ among all tokens in a token stream.
+ """
+ self.filters = kwargs
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.filters == other.filters)
+
+ def __call__(self, tokens):
+ # Only selects on the first token
+ t = next(tokens)
+ filter = self.filters.get(t.mode, self.default_filter)
+ return filter(chain([t], tokens))
+
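+# Illustrative note: a token's "mode" attribute comes from the keyword argument
+# the analyzer is called with (the tokenizers pass mode into each Token), so a
+# MultiFilter-based chain can behave differently at index and query time. A
+# rough sketch, reusing the filters named in the docstring above:
+#
+#   mf = MultiFilter(index=iwf_for_index, query=iwf_for_query)
+#   ana = RegexTokenizer(r"\S+") | mf | LowercaseFilter()
+#   tokens = [t.text for t in ana(u"wi-fi", mode="query")]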
+
+class TeeFilter(Filter):
+ """Interleaves the results of two or more filters (or filter chains).
+
+ NOTE: because it needs to create copies of each token for each sub-filter,
+ this filter is quite slow.
+
+ >>> target = "ALFA BRAVO CHARLIE"
+ >>> # In one branch, we'll lower-case the tokens
+ >>> f1 = LowercaseFilter()
+ >>> # In the other branch, we'll reverse the tokens
+ >>> f2 = ReverseTextFilter()
+ >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2)
+ >>> [token.text for token in ana(target)]
+ ["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"]
+
+ To combine the incoming token stream with the output of a filter chain, use
+ ``TeeFilter`` and make one of the filters a :class:`PassFilter`.
+
+ >>> f1 = PassFilter()
+ >>> f2 = BiWordFilter()
+ >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter()
+ >>> [token.text for token in ana(target)]
+ ["alfa", "alfa-bravo", "bravo", "bravo-charlie", "charlie"]
+ """
+
+ def __init__(self, *filters):
+ if len(filters) < 2:
+ raise Exception("TeeFilter requires two or more filters")
+ self.filters = filters
+
+ def __eq__(self, other):
+ return (self.__class__ is other.__class__
+ and self.filters == other.filters)
+
+ def __call__(self, tokens):
+ from itertools import tee
+
+ count = len(self.filters)
+ # Tee the token iterator and wrap each teed iterator with the
+ # corresponding filter
+ gens = [filter(t.copy() for t in gen) for filter, gen
+ in zip(self.filters, tee(tokens, count))]
+ # Keep a count of the number of running iterators
+ running = count
+ while running:
+ for i, gen in enumerate(gens):
+ if gen is not None:
+ try:
+ yield next(gen)
+ except StopIteration:
+ gens[i] = None
+ running -= 1
+
+
+class ReverseTextFilter(Filter):
+ """Reverses the text of each token.
+
+ >>> ana = RegexTokenizer() | ReverseTextFilter()
+ >>> [token.text for token in ana("hello there")]
+ ["olleh", "ereht"]
+ """
+
+ def __call__(self, tokens):
+ for t in tokens:
+ t.text = t.text[::-1]
+ yield t
+
+
+class LowercaseFilter(Filter):
+ """Uses unicode.lower() to lowercase token text.
+
+ >>> rext = RegexTokenizer()
+ >>> stream = rext("This is a TEST")
+ >>> [token.text for token in LowercaseFilter()(stream)]
+ ["this", "is", "a", "test"]
+ """
+
+ def __call__(self, tokens):
+ for t in tokens:
+ t.text = t.text.lower()
+ yield t
+
+
+class StripFilter(Filter):
+ """Calls unicode.strip() on the token text.
+ """
+
+ def __call__(self, tokens):
+ for t in tokens:
+ t.text = t.text.strip()
+ yield t
+
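+# Illustrative sketch: StripFilter is mostly useful after a tokenizer whose
+# matches can include surrounding whitespace, as in CommaSeparatedTokenizer:
+#
+#   ana = RegexTokenizer(r"[^,]+") | StripFilter()
+#   [t.text for t in ana("hi there, what's , up")]
+#   # -> ["hi there", "what's", "up"]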
+
+class StopFilter(Filter):
+ """Marks "stop" words (words too common to index) in the stream (and by
+ default removes them).
+
+ Make sure you precede this filter with a :class:`LowercaseFilter`.
+
+ >>> stopper = RegexTokenizer() | StopFilter()
+ >>> [token.text for token in stopper(u"this is a test")]
+ ["test"]
+ >>> es_stopper = RegexTokenizer() | StopFilter(lang="es")
+ >>> [token.text for token in es_stopper(u"el lapiz es en la mesa")]
+ ["lapiz", "mesa"]
+
+ The list of available languages is in `whoosh.lang.languages`.
+ You can use :func:`whoosh.lang.has_stopwords` to check if a given language
+ has a stop word list available.
+ """
+
+ def __init__(self, stoplist=STOP_WORDS, minsize=2, maxsize=None,
+ renumber=True, lang=None):
+ """
+ :param stoplist: A collection of words to remove from the stream.
+ This is converted to a frozenset. The default is a list of
+ common English stop words.
+ :param minsize: The minimum length of token texts. Tokens with
+ text smaller than this will be stopped. The default is 2.
+ :param maxsize: The maximum length of token texts. Tokens with text
+ larger than this will be stopped. Use None to allow any length.
+ :param renumber: Change the 'pos' attribute of unstopped tokens
+ to reflect their position with the stopped words removed.
+ :param lang: Automatically get a list of stop words for the given
+ language
+ """
+
+ stops = set()
+ if stoplist:
+ stops.update(stoplist)
+ if lang:
+ from whoosh.lang import stopwords_for_language
+
+ stops.update(stopwords_for_language(lang))
+
+ self.stops = frozenset(stops)
+ self.min = minsize
+ self.max = maxsize
+ self.renumber = renumber
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.stops == other.stops
+ and self.min == other.min
+ and self.renumber == other.renumber)
+
+ def __call__(self, tokens):
+ stoplist = self.stops
+ minsize = self.min
+ maxsize = self.max
+ renumber = self.renumber
+
+ pos = None
+ for t in tokens:
+ text = t.text
+ if (len(text) >= minsize
+ and (maxsize is None or len(text) <= maxsize)
+ and text not in stoplist):
+ # This is not a stop word
+ if renumber and t.positions:
+ if pos is None:
+ pos = t.pos
+ else:
+ pos += 1
+ t.pos = pos
+ t.stopped = False
+ yield t
+ else:
+ # This is a stop word
+ if not t.removestops:
+ # This IS a stop word, but we're not removing them
+ t.stopped = True
+ yield t
+
+
+class CharsetFilter(Filter):
+ """Translates the text of tokens by calling unicode.translate() using the
+ supplied character mapping object. This is useful for case and accent
+ folding.
+
+ The ``whoosh.support.charset`` module has a useful map for accent folding.
+
+ >>> from whoosh.support.charset import accent_map
+ >>> retokenizer = RegexTokenizer()
+ >>> chfilter = CharsetFilter(accent_map)
+ >>> [t.text for t in chfilter(retokenizer(u'café'))]
+ [u'cafe']
+
+ Another way to get a character mapping object is to convert a Sphinx
+ charset table file using
+ :func:`whoosh.support.charset.charset_table_to_dict`.
+
+ >>> from whoosh.support.charset import charset_table_to_dict
+ >>> from whoosh.support.charset import default_charset
+ >>> retokenizer = RegexTokenizer()
+ >>> charmap = charset_table_to_dict(default_charset)
+ >>> chfilter = CharsetFilter(charmap)
+ >>> [t.text for t in chfilter(retokenizer(u'Stra\\xdfe'))]
+ [u'strase']
+
+ The Sphinx charset table format is described at
+ http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
+ """
+
+ __inittypes__ = dict(charmap=dict)
+
+ def __init__(self, charmap):
+ """
+ :param charmap: a dictionary mapping from integer character numbers to
+ unicode characters, as required by the unicode.translate() method.
+ """
+
+ self.charmap = charmap
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.charmap == other.charmap)
+
+ def __call__(self, tokens):
+ assert hasattr(tokens, "__iter__")
+ charmap = self.charmap
+ for t in tokens:
+ t.text = t.text.translate(charmap)
+ yield t
+
+
+class DelimitedAttributeFilter(Filter):
+ """Looks for delimiter characters in the text of each token and stores the
+ data after the delimiter in a named attribute on the token.
+
+ The defaults are set up to use the ``^`` character as a delimiter and store
+ the value after the ``^`` as the boost for the token.
+
+ >>> daf = DelimitedAttributeFilter(delimiter="^", attribute="boost")
+ >>> ana = RegexTokenizer("\\\\S+") | DelimitedAttributeFilter()
+ >>> for t in ana(u("image render^2 file^0.5"))
+ ... print("%r %f" % (t.text, t.boost))
+ 'image' 1.0
+ 'render' 2.0
+ 'file' 0.5
+
+ Note that you need to make sure your tokenizer includes the delimiter and
+ data as part of the token!
+ """
+
+ def __init__(self, delimiter="^", attribute="boost", default=1.0,
+ type=float):
+ """
+ :param delimiter: a string that, when present in a token's text,
+ separates the actual text from the "data" payload.
+ :param attribute: the name of the attribute in which to store the
+ data on the token.
+ :param default: the value to use for the attribute for tokens that
+ don't have delimited data.
+ :param type: the type of the data, for example ``str`` or ``float``.
+ This is used to convert the string value of the data before
+ storing it in the attribute.
+ """
+
+ self.delim = delimiter
+ self.attr = attribute
+ self.default = default
+ self.type = type
+
+ def __eq__(self, other):
+ return (other and self.__class__ is other.__class__
+ and self.delim == other.delim
+ and self.attr == other.attr
+ and self.default == other.default)
+
+ def __call__(self, tokens):
+ delim = self.delim
+ attr = self.attr
+ default = self.default
+ type_ = self.type
+
+ for t in tokens:
+ text = t.text
+ pos = text.find(delim)
+ if pos > -1:
+ setattr(t, attr, type_(text[pos + 1:]))
+ if t.chars:
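+ # Shrink the stored character range so endchar covers only
+ # the text kept before the delimiter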
+ t.endchar -= len(t.text) - pos
+ t.text = text[:pos]
+ else:
+ setattr(t, attr, default)
+
+ yield t
+
+
+class SubstitutionFilter(Filter):
+ """Performs a regular expression substitution on the token text.
+
+ This is especially useful for removing text from tokens, for example
+ hyphens::
+
+ ana = RegexTokenizer(r"\\S+") | SubstitutionFilter("-", "")
+
+ Because it has the full power of the re.sub() method behind it, this filter
+ can perform some fairly complex transformations. For example, to take
+ tokens like ``'a=b', 'c=d', 'e=f'`` and change them to ``'b=a', 'd=c',
+ 'f=e'``::
+
+ # Analyzer that swaps the text on either side of an equal sign
+ rt = RegexTokenizer(r"\\S+")
+ sf = SubstitutionFilter("([^=]*)=(.*)", r"\\2=\\1")
+ ana = rt | sf
+ """
+
+ def __init__(self, pattern, replacement):
+ """
+ :param pattern: a pattern string or compiled regular expression object
+ describing the text to replace.
+ :param replacement: the substitution text.
+ """
+
+ self.pattern = rcompile(pattern)
+ self.replacement = replacement
+
+ def __eq__(self, other):
+ return (other and self.__class__ is other.__class__
+ and self.pattern == other.pattern
+ and self.replacement == other.replacement)
+
+ def __call__(self, tokens):
+ pattern = self.pattern
+ replacement = self.replacement
+
+ for t in tokens:
+ t.text = pattern.sub(replacement, t.text)
+ yield t
diff --git a/src/whoosh/analysis/intraword.py b/src/whoosh/analysis/intraword.py
new file mode 100644
index 0000000..601423e
--- /dev/null
+++ b/src/whoosh/analysis/intraword.py
@@ -0,0 +1,494 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+import re
+from collections import deque
+
+from whoosh.compat import u, text_type
+from whoosh.compat import xrange
+from whoosh.analysis.filters import Filter
+
+
+class CompoundWordFilter(Filter):
+ """Given a set of words (or any object with a ``__contains__`` method),
+ break any tokens in the stream that are composites of words in the word set
+ into their individual parts.
+
+ Given the correct set of words, this filter can break apart run-together
+ words and trademarks (e.g. "turbosquid", "applescript"). It can also be
+ useful for agglutinative languages such as German.
+
+ The ``keep_compound`` argument lets you decide whether to keep the
+ compound word in the token stream along with the word segments.
+
+ >>> cwf = CompoundWordFilter(wordset, keep_compound=True)
+ >>> analyzer = RegexTokenizer(r"\S+") | cwf
+ >>> [t.text for t in analyzer("I do not like greeneggs and ham")]
+ ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"]
+ >>> cwf.keep_compound = False
+ >>> [t.text for t in analyzer("I do not like greeneggs and ham")]
+ ["I", "do", "not", "like", "green", "eggs", "and", "ham"]
+ """
+
+ def __init__(self, wordset, keep_compound=True):
+ """
+ :param wordset: an object with a ``__contains__`` method, such as a
+ set, containing strings to look for inside the tokens.
+ :param keep_compound: if True (the default), the original compound
+ token will be retained in the stream before the subwords.
+ """
+
+ self.wordset = wordset
+ self.keep_compound = keep_compound
+
+ def subwords(self, s, memo):
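+ # Recursively decompose s into a list of words from the wordset,
+ # memoizing results; for example (hedged, matching the class docstring
+ # above), with wordset = {"green", "eggs"}, subwords("greeneggs", {})
+ # would return ["green", "eggs"]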
+ if s in self.wordset:
+ return [s]
+ if s in memo:
+ return memo[s]
+
+ for i in xrange(1, len(s)):
+ prefix = s[:i]
+ if prefix in self.wordset:
+ suffix = s[i:]
+ suffix_subs = self.subwords(suffix, memo)
+ if suffix_subs:
+ result = [prefix] + suffix_subs
+ memo[s] = result
+ return result
+
+ return None
+
+ def __call__(self, tokens):
+ keep_compound = self.keep_compound
+ memo = {}
+ subwords = self.subwords
+ for t in tokens:
+ subs = subwords(t.text, memo)
+ if subs:
+ if len(subs) > 1 and keep_compound:
+ yield t
+ for subword in subs:
+ t.text = subword
+ yield t
+ else:
+ yield t
+
+
+class BiWordFilter(Filter):
+ """Merges adjacent tokens into "bi-word" tokens, so that for example::
+
+ "the", "sign", "of", "four"
+
+ becomes::
+
+ "the-sign", "sign-of", "of-four"
+
+ This can be used to create fields for pseudo-phrase searching, where, if
+ all the terms match, the document probably contains the phrase, but the
+ search is faster than actually doing a phrase search on individual word
+ terms.
+
+ The ``BiWordFilter`` is much faster than using the otherwise equivalent
+ ``ShingleFilter(2)``.
+ """
+
+ def __init__(self, sep="-"):
+ self.sep = sep
+
+ def __call__(self, tokens):
+ sep = self.sep
+ prev_text = None
+ prev_startchar = None
+ prev_pos = None
+ atleastone = False
+
+ for token in tokens:
+ # Save the original text of this token
+ text = token.text
+
+ # Save the original position
+ positions = token.positions
+ if positions:
+ ps = token.pos
+
+ # Save the original start char
+ chars = token.chars
+ if chars:
+ sc = token.startchar
+
+ if prev_text is not None:
+ # Use the pos and startchar from the previous token
+ if positions:
+ token.pos = prev_pos
+ if chars:
+ token.startchar = prev_startchar
+
+ # Join the previous token text and the current token text to
+ # form the biword token
+ token.text = "".join((prev_text, sep, text))
+ yield token
+ atleastone = True
+
+ # Save the originals and the new "previous" values
+ prev_text = text
+ if chars:
+ prev_startchar = sc
+ if positions:
+ prev_pos = ps
+
+ # If no bi-words were emitted, that is, the token stream only had
+ # a single token, then emit that single token.
+ if not atleastone:
+ yield token
+
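+# A minimal sketch of BiWordFilter in an analyzer chain (illustrative only):
+#
+#   ana = RegexTokenizer(r"\S+") | LowercaseFilter() | BiWordFilter(sep="-")
+#   [t.text for t in ana(u"the sign of four")]
+#   # -> ["the-sign", "sign-of", "of-four"]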
+
+class ShingleFilter(Filter):
+ """Merges a certain number of adjacent tokens into multi-word tokens, so
+ that for example::
+
+ "better", "a", "witty", "fool", "than", "a", "foolish", "wit"
+
+ with ``ShingleFilter(3, ' ')`` becomes::
+
+ 'better a witty', 'a witty fool', 'witty fool than', 'fool than a',
+ 'than a foolish', 'a foolish wit'
+
+ This can be used to create fields for pseudo-phrase searching, where, if
+ all the terms match, the document probably contains the phrase, but the
+ search is faster than actually doing a phrase search on individual word
+ terms.
+
+ If you're using two-word shingles, you should use the functionally
+ equivalent ``BiWordFilter`` instead because it's faster than
+ ``ShingleFilter``.
+ """
+
+ def __init__(self, size=2, sep="-"):
+ self.size = size
+ self.sep = sep
+
+ def __call__(self, tokens):
+ size = self.size
+ sep = self.sep
+ buf = deque()
+ atleastone = False
+
+ def make_token():
+ tk = buf[0]
+ tk.text = sep.join([t.text for t in buf])
+ if tk.chars:
+ tk.endchar = buf[-1].endchar
+ return tk
+
+ for token in tokens:
+ if not token.stopped:
+ buf.append(token.copy())
+ if len(buf) == size:
+ atleastone = True
+ yield make_token()
+ buf.popleft()
+
+ # If no shingles were emitted, that is, the token stream had fewer than
+ # 'size' tokens, then emit a single token with whatever tokens there
+ # were
+ if not atleastone and buf:
+ yield make_token()
+
+
+class IntraWordFilter(Filter):
+ """Splits words into subwords and performs optional transformations on
+ subword groups. This filter is functionally based on yonik's
+ WordDelimiterFilter in Solr, but shares no code with it.
+
+ * Split on intra-word delimiters, e.g. `Wi-Fi` -> `Wi`, `Fi`.
+ * When splitwords=True, split on case transitions,
+ e.g. `PowerShot` -> `Power`, `Shot`.
+ * When splitnums=True, split on letter-number transitions,
+ e.g. `SD500` -> `SD`, `500`.
+ * Leading and trailing delimiter characters are ignored.
+ * Trailing possessive "'s" is removed from subwords,
+ e.g. `O'Neil's` -> `O`, `Neil`.
+
+ The mergewords and mergenums arguments turn on merging of subwords.
+
+ When the merge arguments are false, subwords are not merged.
+
+ * `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token
+ positions).
+
+ When one or both of the merge arguments are true, consecutive runs of
+ alphabetic and/or numeric subwords are merged into an additional token with
+ the same position as the last sub-word.
+
+ * `PowerShot` -> `0`:`Power`, `1`:`Shot`, `1`:`PowerShot`
+ * `A's+B's&C's` -> `0`:`A`, `1`:`B`, `2`:`C`, `2`:`ABC`
+ * `Super-Duper-XL500-42-AutoCoder!` -> `0`:`Super`, `1`:`Duper`, `2`:`XL`,
+ `2`:`SuperDuperXL`,
+ `3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`,
+ `6`:`AutoCoder`
+
+ When using this filter you should use a tokenizer that only splits on
+ whitespace, so the tokenizer does not remove intra-word delimiters before
+ this filter can see them, and put this filter before any use of
+ LowercaseFilter.
+
+ >>> rt = RegexTokenizer(r"\\S+")
+ >>> iwf = IntraWordFilter()
+ >>> lcf = LowercaseFilter()
+ >>> analyzer = rt | iwf | lcf
+
+ One use for this filter is to help match different written representations
+ of a concept. For example, if the source text contained `wi-fi`, you
+ probably want `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this
+ is to specify mergewords=True and/or mergenums=True in the analyzer used
+ for indexing, and mergewords=False / mergenums=False in the analyzer used
+ for querying.
+
+ >>> iwf_i = IntraWordFilter(mergewords=True, mergenums=True)
+ >>> iwf_q = IntraWordFilter(mergewords=False, mergenums=False)
+ >>> iwf = MultiFilter(index=iwf_i, query=iwf_q)
+ >>> analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter()
+
+ (See :class:`MultiFilter`.)
+ """
+
+ is_morph = True
+
+ __inittypes__ = dict(delims=text_type, splitwords=bool, splitnums=bool,
+ mergewords=bool, mergenums=bool)
+
+ def __init__(self, delims=u("-_'\"()!@#$%^&*[]{}<>\|;:,./?`~=+"),
+ splitwords=True, splitnums=True,
+ mergewords=False, mergenums=False):
+ """
+ :param delims: a string of delimiter characters.
+ :param splitwords: if True, split at case transitions,
+ e.g. `PowerShot` -> `Power`, `Shot`
+ :param splitnums: if True, split at letter-number transitions,
+ e.g. `SD500` -> `SD`, `500`
+ :param mergewords: merge consecutive runs of alphabetic subwords into
+ an additional token with the same position as the last subword.
+ :param mergenums: merge consecutive runs of numeric subwords into an
+ additional token with the same position as the last subword.
+ """
+
+ from whoosh.support.unicode import digits, lowercase, uppercase
+
+ self.delims = re.escape(delims)
+
+ # Expression for text between delimiter characters
+ self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE)
+ # Expression for removing "'s" from the end of sub-words
+ dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase,
+ self.delims)
+ self.possessive = re.compile(dispat, re.UNICODE)
+
+ # Expression for finding case and letter-number transitions
+ lower2upper = u("[%s][%s]") % (lowercase, uppercase)
+ letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits)
+ digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase)
+ if splitwords and splitnums:
+ splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit,
+ digit2letter)
+ self.boundary = re.compile(splitpat, re.UNICODE)
+ elif splitwords:
+ self.boundary = re.compile(text_type(lower2upper), re.UNICODE)
+ elif splitnums:
+ numpat = u("(%s|%s)") % (letter2digit, digit2letter)
+ self.boundary = re.compile(numpat, re.UNICODE)
+
+ self.splitting = splitwords or splitnums
+ self.mergewords = mergewords
+ self.mergenums = mergenums
+
+ def __eq__(self, other):
+ return other and self.__class__ is other.__class__\
+ and self.__dict__ == other.__dict__
+
+ def _split(self, string):
+ bound = self.boundary
+
+ # Yields (startchar, endchar) pairs for each indexable substring in
+ # the given string, e.g. "WikiWord" -> (0, 4), (4, 8)
+
+ # Whether we're splitting on transitions (case changes, letter -> num,
+ # num -> letter, etc.)
+ splitting = self.splitting
+
+ # Make a list (dispos, for "dispossessed") of (startchar, endchar)
+ # pairs for runs of text between "'s"
+ if "'" in string:
+ # Split on possessive 's
+ dispos = []
+ prev = 0
+ for match in self.possessive.finditer(string):
+ dispos.append((prev, match.start()))
+ prev = match.end()
+ if prev < len(string):
+ dispos.append((prev, len(string)))
+ else:
+ # Shortcut if there's no apostrophe in the string
+ dispos = ((0, len(string)),)
+
+ # For each run between 's
+ for sc, ec in dispos:
+ # Split on boundary characters
+ for part_match in self.between.finditer(string, sc, ec):
+ part_start = part_match.start()
+ part_end = part_match.end()
+
+ if splitting:
+ # The point to start splitting at
+ prev = part_start
+ # Find transitions (e.g. "iW" or "a0")
+ for bmatch in bound.finditer(string, part_start, part_end):
+ # The point in the middle of the transition
+ pivot = bmatch.start() + 1
+ # Yield from the previous match to the transition
+ yield (prev, pivot)
+ # Make the transition the new starting point
+ prev = pivot
+
+ # If there's leftover text at the end, yield it too
+ if prev < part_end:
+ yield (prev, part_end)
+ else:
+ # Not splitting on transitions, just yield the part
+ yield (part_start, part_end)
+
+ def _merge(self, parts):
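+ # Hedged worked example: with mergewords=True, parts such as
+ # [("Power", 0, 0, 5), ("Shot", 1, 5, 9)] gain an extra merged entry
+ # ("PowerShot", 1, 0, 9), i.e. the merged text spans the run and takes
+ # the position of the last subword (see the class docstring).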
+ mergewords = self.mergewords
+ mergenums = self.mergenums
+
+ # Current type (1=alpha, 2=digit)
+ last = 0
+ # Where to insert a merged term in the original list
+ insertat = 0
+ # Buffer for parts to merge
+ buf = []
+ # Iterate on a copy of the parts list so we can modify the original as
+ # we go
+
+ def insert_item(buf, at, newpos):
+ newtext = "".join(item[0] for item in buf)
+ newsc = buf[0][2] # start char of first item in buffer
+ newec = buf[-1][3] # end char of last item in buffer
+ parts.insert(insertat, (newtext, newpos, newsc, newec))
+
+ for item in list(parts):
+ # item = (text, pos, startchar, endchar)
+ text = item[0]
+ pos = item[1]
+
+ # Set the type of this part
+ if text.isalpha():
+ this = 1
+ elif text.isdigit():
+ this = 2
+ else:
+ this = None
+
+ # Is this the same type as the previous part?
+ if (buf and (this == last == 1 and mergewords)
+ or (this == last == 2 and mergenums)):
+ # This part is the same type as the previous. Add it to the
+ # buffer of parts to merge.
+ buf.append(item)
+ else:
+ # This part is different than the previous.
+ if len(buf) > 1:
+ # If the buffer has at least two parts in it, merge them
+ # and add them to the original list of parts.
+ insert_item(buf, insertat, pos - 1)
+ insertat += 1
+ # Reset the buffer
+ buf = [item]
+ last = this
+ insertat += 1
+
+ # If there are parts left in the buffer at the end, merge them and add
+ # them to the original list.
+ if len(buf) > 1:
+ insert_item(buf, len(parts), pos)
+
+ def __call__(self, tokens):
+ mergewords = self.mergewords
+ mergenums = self.mergenums
+
+ # This filter renumbers tokens as it expands them. New position
+ # counter.
+ newpos = None
+ for t in tokens:
+ text = t.text
+
+ # If this is the first token we've seen, use it to set the new
+ # position counter
+ if newpos is None:
+ if t.positions:
+ newpos = t.pos
+ else:
+ # Token doesn't have positions, just use 0
+ newpos = 0
+
+ if ((text.isalpha() and (text.islower() or text.isupper()))
+ or text.isdigit()):
+ # Short-circuit the common cases of no delimiters, no case
+ # transitions, only digits, etc.
+ t.pos = newpos
+ yield t
+ newpos += 1
+ else:
+ # Split the token text on delimiters, word and/or number
+ # boundaries into a list of (text, pos, startchar, endchar)
+ # tuples
+ ranges = self._split(text)
+ parts = [(text[sc:ec], i + newpos, sc, ec)
+ for i, (sc, ec) in enumerate(ranges)]
+
+ # Did the split yield more than one part?
+ if len(parts) > 1:
+ # If the options are set, merge consecutive runs of all-
+ # letters and/or all-numbers.
+ if mergewords or mergenums:
+ self._merge(parts)
+
+ # Yield tokens for the parts
+ chars = t.chars
+ if chars:
+ base = t.startchar
+ for text, pos, startchar, endchar in parts:
+ t.text = text
+ t.pos = pos
+ if t.chars:
+ t.startchar = base + startchar
+ t.endchar = base + endchar
+ yield t
+
+ if parts:
+ # Set the new position counter based on the last part
+ newpos = parts[-1][1] + 1
diff --git a/src/whoosh/analysis/morph.py b/src/whoosh/analysis/morph.py
new file mode 100644
index 0000000..b7d644f
--- /dev/null
+++ b/src/whoosh/analysis/morph.py
@@ -0,0 +1,267 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.analysis.filters import Filter
+from whoosh.compat import integer_types
+from whoosh.lang.dmetaphone import double_metaphone
+from whoosh.lang.porter import stem
+from whoosh.util.cache import lfu_cache, unbound_cache
+
+
+class StemFilter(Filter):
+ """Stems (removes suffixes from) the text of tokens using the Porter
+ stemming algorithm. Stemming attempts to reduce multiple forms of the same
+ root word (for example, "rendering", "renders", "rendered", etc.) to a
+ single word in the index.
+
+ >>> stemmer = RegexTokenizer() | StemFilter()
+ >>> [token.text for token in stemmer("fundamentally willows")]
+ ["fundament", "willow"]
+
+ You can pass your own stemming function to the StemFilter. The default
+ is the Porter stemming algorithm for English.
+
+ >>> stemfilter = StemFilter(stem_function)
+
+ You can also use one of the Snowball stemming functions by passing the
+ `lang` keyword argument.
+
+ >>> stemfilter = StemFilter(lang="ru")
+
+ The list of available languages is in `whoosh.lang.languages`.
+ You can use :func:`whoosh.lang.has_stemmer` to check if a given language has
+ a stemming function available.
+
+ By default, this class wraps an LRU cache around the stemming function. The
+ ``cachesize`` keyword argument sets the size of the cache. To make the
+ cache unbounded (the class caches every input), use ``cachesize=-1``. To
+ disable caching, use ``cachesize=None``.
+
+ If you compile and install the py-stemmer library, the
+ :class:`PyStemmerFilter` provides slightly easier access to the language
+ stemmers in that library.
+ """
+
+ __inittypes__ = dict(stemfn=object, ignore=list)
+
+ is_morph = True
+
+ def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000):
+ """
+ :param stemfn: the function to use for stemming.
+ :param lang: if not None, overrides the stemfn with a language stemmer
+ from the ``whoosh.lang.snowball`` package.
+ :param ignore: a set/list of words that should not be stemmed. This is
+ converted into a frozenset. If you omit this argument, all tokens
+ are stemmed.
+ :param cachesize: the maximum number of words to cache. Use ``-1`` for
+ an unbounded cache, or ``None`` for no caching.
+ """
+
+ self.stemfn = stemfn
+ self.lang = lang
+ self.ignore = frozenset() if ignore is None else frozenset(ignore)
+ self.cachesize = cachesize
+ # clear() sets the _stem attr to a cached wrapper around self.stemfn
+ self.clear()
+
+ def __getstate__(self):
+ # Can't pickle a dynamic function, so we have to remove the _stem
+ # attribute from the state
+ return dict([(k, self.__dict__[k]) for k in self.__dict__
+ if k != "_stem"])
+
+ def __setstate__(self, state):
+ # Check for old instances of StemFilter class, which didn't have a
+ # cachesize attribute and pickled the cache attribute
+ if "cachesize" not in state:
+ self.cachesize = 50000
+ if "ignores" in state:
+ self.ignore = state["ignores"]
+ elif "ignore" not in state:
+ self.ignore = frozenset()
+ if "lang" not in state:
+ self.lang = None
+ if "cache" in state:
+ del state["cache"]
+
+ self.__dict__.update(state)
+ # Set the _stem attribute
+ self.clear()
+
+ def clear(self):
+ if self.lang:
+ from whoosh.lang import stemmer_for_language
+ stemfn = stemmer_for_language(self.lang)
+ else:
+ stemfn = self.stemfn
+
+ if isinstance(self.cachesize, integer_types) and self.cachesize != 0:
+ if self.cachesize < 0:
+ self._stem = unbound_cache(stemfn)
+ elif self.cachesize > 1:
+ self._stem = lfu_cache(self.cachesize)(stemfn)
+ else:
+ self._stem = stemfn
+
+ def cache_info(self):
+ if self.cachesize <= 1:
+ return None
+ return self._stem.cache_info()
+
+ def __eq__(self, other):
+ return (other and self.__class__ is other.__class__
+ and self.stemfn == other.stemfn)
+
+ def __call__(self, tokens):
+ stemfn = self._stem
+ ignore = self.ignore
+
+ for t in tokens:
+ if not t.stopped:
+ text = t.text
+ if text not in ignore:
+ t.text = stemfn(text)
+ yield t
+
+
+class PyStemmerFilter(StemFilter):
+ """This is a simple subclass of StemFilter that works with the py-stemmer
+ third-party library. You must have the py-stemmer library installed to use
+ this filter.
+
+ >>> PyStemmerFilter("spanish")
+ """
+
+ def __init__(self, lang="english", ignore=None, cachesize=10000):
+ """
+ :param lang: a string identifying the stemming algorithm to use. You
+ can get a list of available algorithms with the
+ :meth:`PyStemmerFilter.algorithms` method. The identification
+ strings are directly from the py-stemmer library.
+ :param ignore: a set/list of words that should not be stemmed. This is
+ converted into a frozenset. If you omit this argument, all tokens
+ are stemmed.
+ :param cachesize: the maximum number of words to cache.
+ """
+
+ self.lang = lang
+ self.ignore = frozenset() if ignore is None else frozenset(ignore)
+ self.cachesize = cachesize
+ self._stem = self._get_stemmer_fn()
+
+ def algorithms(self):
+ """Returns a list of stemming algorithms provided by the py-stemmer
+ library.
+ """
+
+ import Stemmer # @UnresolvedImport
+
+ return Stemmer.algorithms()
+
+ def cache_info(self):
+ return None
+
+ def _get_stemmer_fn(self):
+ import Stemmer # @UnresolvedImport
+
+ stemmer = Stemmer.Stemmer(self.lang)
+ stemmer.maxCacheSize = self.cachesize
+ return stemmer.stemWord
+
+ def __getstate__(self):
+ # Can't pickle a dynamic function, so we have to remove the _stem
+ # attribute from the state
+ return dict([(k, self.__dict__[k]) for k in self.__dict__
+ if k != "_stem"])
+
+ def __setstate__(self, state):
+ # Check for old instances of StemFilter class, which didn't have a
+ # cachesize attribute and pickled the cache attribute
+ if "cachesize" not in state:
+ self.cachesize = 10000
+ if "ignores" in state:
+ self.ignore = state["ignores"]
+ elif "ignore" not in state:
+ self.ignore = frozenset()
+ if "cache" in state:
+ del state["cache"]
+
+ self.__dict__.update(state)
+ # Set the _stem attribute
+ self._stem = self._get_stemmer_fn()
+
+
+class DoubleMetaphoneFilter(Filter):
+ """Transforms the text of the tokens using Lawrence Philips's Double
+ Metaphone algorithm. This algorithm attempts to encode words in such a way
+ that similar-sounding words reduce to the same code. This may be useful for
+ fields containing the names of people and places, and other uses where
+ tolerance of spelling differences is desirable.
+ """
+
+ is_morph = True
+
+ def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
+ """
+ :param primary_boost: the boost to apply to the token containing the
+ primary code.
+ :param secondary_boost: the boost to apply to the token containing the
+ secondary code, if any.
+ :param combine: if True, the original unencoded tokens are kept in the
+ stream, preceding the encoded tokens.
+ """
+
+ self.primary_boost = primary_boost
+ self.secondary_boost = secondary_boost
+ self.combine = combine
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.primary_boost == other.primary_boost)
+
+ def __call__(self, tokens):
+ primary_boost = self.primary_boost
+ secondary_boost = self.secondary_boost
+ combine = self.combine
+
+ for t in tokens:
+ if combine:
+ yield t
+
+ primary, secondary = double_metaphone(t.text)
+ b = t.boost
+ # Overwrite the token's text and boost and yield it
+ if primary:
+ t.text = primary
+ t.boost = b * primary_boost
+ yield t
+ if secondary:
+ t.text = secondary
+ t.boost = b * secondary_boost
+ yield t
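+
+
+# A minimal usage sketch for DoubleMetaphoneFilter (illustrative only):
+#
+#   ana = RegexTokenizer() | LowercaseFilter() | DoubleMetaphoneFilter()
+#   [t.text for t in ana(u"Smith Smyth")]
+#   # both spellings are expected to reduce to the same primary code,
+#   # so documents containing either will match the same terms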
diff --git a/src/whoosh/analysis/ngrams.py b/src/whoosh/analysis/ngrams.py
new file mode 100644
index 0000000..a57fcde
--- /dev/null
+++ b/src/whoosh/analysis/ngrams.py
@@ -0,0 +1,237 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.compat import text_type
+from whoosh.compat import xrange
+from whoosh.analysis.acore import Token
+from whoosh.analysis.filters import Filter, LowercaseFilter
+from whoosh.analysis.tokenizers import Tokenizer, RegexTokenizer
+
+
+# Tokenizer
+
+class NgramTokenizer(Tokenizer):
+ """Splits input text into N-grams instead of words.
+
+ >>> ngt = NgramTokenizer(4)
+ >>> [token.text for token in ngt("hi there")]
+ ["hi t", "i th", " the", "ther", "here"]
+
+ Note that this tokenizer does NOT use a regular expression to extract
+ words, so the grams emitted by it will contain whitespace, punctuation,
+ etc. You may want to massage the input or add a custom filter to this
+ tokenizer's output.
+
+ Alternatively, if you only want sub-word grams without whitespace, you
+ could combine a RegexTokenizer with NgramFilter instead.
+ """
+
+ __inittypes__ = dict(minsize=int, maxsize=int)
+
+ def __init__(self, minsize, maxsize=None):
+ """
+ :param minsize: The minimum size of the N-grams.
+ :param maxsize: The maximum size of the N-grams. If you omit
+ this parameter, maxsize == minsize.
+ """
+
+ self.min = minsize
+ self.max = maxsize or minsize
+
+ def __eq__(self, other):
+ if self.__class__ is other.__class__:
+ if self.min == other.min and self.max == other.max:
+ return True
+ return False
+
+ def __call__(self, value, positions=False, chars=False, keeporiginal=False,
+ removestops=True, start_pos=0, start_char=0, mode='',
+ **kwargs):
+ assert isinstance(value, text_type), "%r is not unicode" % value
+
+ inlen = len(value)
+ t = Token(positions, chars, removestops=removestops, mode=mode)
+ pos = start_pos
+
+ if mode == "query":
+ size = min(self.max, inlen)
+ for start in xrange(0, inlen - size + 1):
+ end = start + size
+ if end > inlen:
+ continue
+ t.text = value[start:end]
+ if keeporiginal:
+ t.original = t.text
+ t.stopped = False
+ if positions:
+ t.pos = pos
+ if chars:
+ t.startchar = start_char + start
+ t.endchar = start_char + end
+ yield t
+ pos += 1
+ else:
+ for start in xrange(0, inlen - self.min + 1):
+ for size in xrange(self.min, self.max + 1):
+ end = start + size
+ if end > inlen:
+ continue
+ t.text = value[start:end]
+ if keeporiginal:
+ t.original = t.text
+ t.stopped = False
+ if positions:
+ t.pos = pos
+ if chars:
+ t.startchar = start_char + start
+ t.endchar = start_char + end
+
+ yield t
+ pos += 1
+
+
+# Filter
+
+class NgramFilter(Filter):
+ """Splits token text into N-grams.
+
+ >>> rext = RegexTokenizer()
+ >>> stream = rext("hello there")
+ >>> ngf = NgramFilter(4)
+ >>> [token.text for token in ngf(stream)]
+ ["hell", "ello", "ther", "here"]
+ """
+
+ __inittypes__ = dict(minsize=int, maxsize=int)
+
+ def __init__(self, minsize, maxsize=None, at=None):
+ """
+ :param minsize: The minimum size of the N-grams.
+ :param maxsize: The maximum size of the N-grams. If you omit this
+ parameter, maxsize == minsize.
+ :param at: If 'start', only take N-grams from the start of each word.
+ if 'end', only take N-grams from the end of each word. Otherwise,
+ take all N-grams from the word (the default).
+ """
+
+ self.min = minsize
+ self.max = maxsize or minsize
+ self.at = 0
+ if at == "start":
+ self.at = -1
+ elif at == "end":
+ self.at = 1
+
+ def __eq__(self, other):
+ return other and self.__class__ is other.__class__\
+ and self.min == other.min and self.max == other.max
+
+ def __call__(self, tokens):
+ assert hasattr(tokens, "__iter__")
+ at = self.at
+ for t in tokens:
+ text = t.text
+ if len(text) < self.min:
+ continue
+
+ chars = t.chars
+ if chars:
+ startchar = t.startchar
+ # Token positions don't mean much for N-grams,
+ # so we'll leave the token's original position
+ # untouched.
+
+ if t.mode == "query":
+ size = min(self.max, len(t.text))
+ if at == -1:
+ t.text = text[:size]
+ if chars:
+ t.endchar = startchar + size
+ yield t
+ elif at == 1:
+ t.text = text[0 - size:]
+ if chars:
+ t.startchar = t.endchar - size
+ yield t
+ else:
+ for start in xrange(0, len(text) - size + 1):
+ t.text = text[start:start + size]
+ if chars:
+ t.startchar = startchar + start
+ t.endchar = startchar + start + size
+ yield t
+ else:
+ if at == -1:
+ limit = min(self.max, len(text))
+ for size in xrange(self.min, limit + 1):
+ t.text = text[:size]
+ if chars:
+ t.endchar = startchar + size
+ yield t
+
+ elif at == 1:
+ if chars:
+ original_startchar = t.startchar
+ start = max(0, len(text) - self.max)
+ for i in xrange(start, len(text) - self.min + 1):
+ t.text = text[i:]
+ if chars:
+ t.startchar = original_startchar + i
+ yield t
+ else:
+ for start in xrange(0, len(text) - self.min + 1):
+ for size in xrange(self.min, self.max + 1):
+ end = start + size
+ if end > len(text):
+ continue
+
+ t.text = text[start:end]
+
+ if chars:
+ t.startchar = startchar + start
+ t.endchar = startchar + end
+
+ yield t
+
+
+# Analyzers
+
+def NgramAnalyzer(minsize, maxsize=None):
+ """Composes an NgramTokenizer and a LowercaseFilter.
+
+ >>> ana = NgramAnalyzer(4)
+ >>> [token.text for token in ana("hi there")]
+ ["hi t", "i th", " the", "ther", "here"]
+ """
+
+ return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()
+
+
+def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None):
+ """Composes a word tokenizer (RegexTokenizer by default), a
+ LowercaseFilter, and an NgramFilter, so that N-grams are taken from within
+ the tokenized words rather than from the raw text.
+ """
+
+ if not tokenizer:
+ tokenizer = RegexTokenizer()
+ return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at)
diff --git a/src/whoosh/analysis/tokenizers.py b/src/whoosh/analysis/tokenizers.py
new file mode 100644
index 0000000..630ad46
--- /dev/null
+++ b/src/whoosh/analysis/tokenizers.py
@@ -0,0 +1,338 @@
+# Copyright 2007 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.compat import u, text_type
+from whoosh.analysis.acore import Composable, Token
+from whoosh.util.text import rcompile
+
+
+default_pattern = rcompile(r"\w+(\.?\w+)*")
+
+
+# Tokenizers
+
+
+class Tokenizer(Composable):
+ """Base class for Tokenizers.
+ """
+
+ def __eq__(self, other):
+ return other and self.__class__ is other.__class__
+
+
+class IDTokenizer(Tokenizer):
+ """Yields the entire input string as a single token. For use in indexed but
+ untokenized fields, such as a document's path.
+
+ >>> idt = IDTokenizer()
+ >>> [token.text for token in idt("/a/b 123 alpha")]
+ ["/a/b 123 alpha"]
+ """
+
+ def __call__(self, value, positions=False, chars=False,
+ keeporiginal=False, removestops=True,
+ start_pos=0, start_char=0, mode='', **kwargs):
+ assert isinstance(value, text_type), "%r is not unicode" % value
+ t = Token(positions, chars, removestops=removestops, mode=mode,
+ **kwargs)
+ t.text = value
+ t.boost = 1.0
+ if keeporiginal:
+ t.original = value
+ if positions:
+ t.pos = start_pos + 1
+ if chars:
+ t.startchar = start_char
+ t.endchar = start_char + len(value)
+ yield t
+
+
+class RegexTokenizer(Tokenizer):
+ """
+ Uses a regular expression to extract tokens from text.
+
+ >>> rex = RegexTokenizer()
+ >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
+ ["hi", "there", "3.141", "big", "time", "under_score"]
+ """
+
+ def __init__(self, expression=default_pattern, gaps=False):
+ """
+ :param expression: A regular expression object or string. Each match
+ of the expression equals a token. Group 0 (the entire matched text)
+ is used as the text of the token. If you require more complicated
+ handling of the expression match, simply write your own tokenizer.
+ :param gaps: If True, the tokenizer *splits* on the expression, rather
+ than matching on the expression.
+ """
+
+ self.expression = rcompile(expression)
+ self.gaps = gaps
+
+ def __eq__(self, other):
+ if self.__class__ is other.__class__:
+ if self.expression.pattern == other.expression.pattern:
+ return True
+ return False
+
+ def __call__(self, value, positions=False, chars=False, keeporiginal=False,
+ removestops=True, start_pos=0, start_char=0, tokenize=True,
+ mode='', **kwargs):
+ """
+ :param value: The unicode string to tokenize.
+ :param positions: Whether to record token positions in the token.
+ :param chars: Whether to record character offsets in the token.
+ :param start_pos: The position number of the first token. For example,
+ if you set start_pos=2, the tokens will be numbered 2,3,4,...
+ instead of 0,1,2,...
+ :param start_char: The offset of the first character of the first
+ token. For example, if you set start_char=2, the text "aaa bbb"
+ will have chars (2,5),(6,9) instead (0,3),(4,7).
+ :param tokenize: if True, the text should be tokenized.
+ """
+
+ assert isinstance(value, text_type), "%s is not unicode" % repr(value)
+
+ t = Token(positions, chars, removestops=removestops, mode=mode,
+ **kwargs)
+ if not tokenize:
+ t.original = t.text = value
+ t.boost = 1.0
+ if positions:
+ t.pos = start_pos
+ if chars:
+ t.startchar = start_char
+ t.endchar = start_char + len(value)
+ yield t
+ elif not self.gaps:
+ # The default: expression matches are used as tokens
+ for pos, match in enumerate(self.expression.finditer(value)):
+ t.text = match.group(0)
+ t.boost = 1.0
+ if keeporiginal:
+ t.original = t.text
+ t.stopped = False
+ if positions:
+ t.pos = start_pos + pos
+ if chars:
+ t.startchar = start_char + match.start()
+ t.endchar = start_char + match.end()
+ yield t
+ else:
+ # When gaps=True, iterate through the matches and
+ # yield the text between them.
+ prevend = 0
+ pos = start_pos
+ for match in self.expression.finditer(value):
+ start = prevend
+ end = match.start()
+ text = value[start:end]
+ if text:
+ t.text = text
+ t.boost = 1.0
+ if keeporiginal:
+ t.original = t.text
+ t.stopped = False
+ if positions:
+ t.pos = pos
+ pos += 1
+ if chars:
+ t.startchar = start_char + start
+ t.endchar = start_char + end
+
+ yield t
+
+ prevend = match.end()
+
+ # If the last "gap" was before the end of the text,
+ # yield the last bit of text as a final token.
+ if prevend < len(value):
+ t.text = value[prevend:]
+ t.boost = 1.0
+ if keeporiginal:
+ t.original = t.text
+ t.stopped = False
+ if positions:
+ t.pos = pos
+ if chars:
+ t.startchar = prevend
+ t.endchar = len(value)
+ yield t
+
+
+class CharsetTokenizer(Tokenizer):
+ """Tokenizes and translates text according to a character mapping object.
+ Characters that map to None are considered token break characters. For all
+ other characters the map is used to translate the character. This is useful
+ for case and accent folding.
+
+ This tokenizer loops character-by-character and so will likely be much
+ slower than :class:`RegexTokenizer`.
+
+ One way to get a character mapping object is to convert a Sphinx charset
+ table file using :func:`whoosh.support.charset.charset_table_to_dict`.
+
+ >>> from whoosh.support.charset import charset_table_to_dict
+ >>> from whoosh.support.charset import default_charset
+ >>> charmap = charset_table_to_dict(default_charset)
+ >>> chtokenizer = CharsetTokenizer(charmap)
+ >>> [t.text for t in chtokenizer(u'Stra\\xdfe ABC')]
+ [u'strase', u'abc']
+
+ The Sphinx charset table format is described at
+ http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
+ """
+
+ __inittype__ = dict(charmap=str)
+
+ def __init__(self, charmap):
+ """
+ :param charmap: a mapping from integer character numbers to unicode
+ characters, as used by the unicode.translate() method.
+ """
+ self.charmap = charmap
+
+ def __eq__(self, other):
+ return (other
+ and self.__class__ is other.__class__
+ and self.charmap == other.charmap)
+
+ def __call__(self, value, positions=False, chars=False, keeporiginal=False,
+ removestops=True, start_pos=0, start_char=0, tokenize=True,
+ mode='', **kwargs):
+ """
+ :param value: The unicode string to tokenize.
+ :param positions: Whether to record token positions in the token.
+ :param chars: Whether to record character offsets in the token.
+ :param start_pos: The position number of the first token. For example,
+ if you set start_pos=2, the tokens will be numbered 2,3,4,...
+ instead of 0,1,2,...
+ :param start_char: The offset of the first character of the first
+ token. For example, if you set start_char=2, the text "aaa bbb"
+ will have chars (2,5),(6,9) instead (0,3),(4,7).
+ :param tokenize: if True, the text should be tokenized.
+ """
+
+ assert isinstance(value, text_type), "%r is not unicode" % value
+
+ t = Token(positions, chars, removestops=removestops, mode=mode,
+ **kwargs)
+ if not tokenize:
+ t.original = t.text = value
+ t.boost = 1.0
+ if positions:
+ t.pos = start_pos
+ if chars:
+ t.startchar = start_char
+ t.endchar = start_char + len(value)
+ yield t
+ else:
+ text = u("")
+ charmap = self.charmap
+ pos = start_pos
+ startchar = currentchar = start_char
+ for char in value:
+ tchar = charmap[ord(char)]
+ if tchar:
+ text += tchar
+ else:
+ if currentchar > startchar:
+ t.text = text
+ t.boost = 1.0
+ if keeporiginal:
+ t.original = t.text
+ if positions:
+ t.pos = pos
+ pos += 1
+ if chars:
+ t.startchar = startchar
+ t.endchar = currentchar
+ yield t
+ startchar = currentchar + 1
+ text = u("")
+
+ currentchar += 1
+
+ if currentchar > startchar:
+ t.text = value[startchar:currentchar]
+ t.boost = 1.0
+ if keeporiginal:
+ t.original = t.text
+ if positions:
+ t.pos = pos
+ if chars:
+ t.startchar = startchar
+ t.endchar = currentchar
+ yield t
+
+
+def SpaceSeparatedTokenizer():
+ """Returns a RegexTokenizer that splits tokens by whitespace.
+
+ >>> sst = SpaceSeparatedTokenizer()
+ >>> [token.text for token in sst("hi there big-time, what's up")]
+ ["hi", "there", "big-time,", "what's", "up"]
+ """
+
+ return RegexTokenizer(r"[^ \t\r\n]+")
+
+
+def CommaSeparatedTokenizer():
+ """Splits tokens by commas.
+
+ Note that the tokenizer calls unicode.strip() on each match of the regular
+ expression.
+
+ >>> cst = CommaSeparatedTokenizer()
+ >>> [token.text for token in cst("hi there, what's , up")]
+ ["hi there", "what's", "up"]
+ """
+
+ from whoosh.analysis.filters import StripFilter
+
+ return RegexTokenizer(r"[^,]+") | StripFilter()
+
+
+class PathTokenizer(Tokenizer):
+ """A simple tokenizer that given a string ``"/a/b/c"`` yields tokens
+ ``["/a", "/a/b", "/a/b/c"]``.
+ """
+
+ def __init__(self, expression="[^/]+"):
+ self.expr = rcompile(expression)
+
+ def __call__(self, value, positions=False, start_pos=0, **kwargs):
+ assert isinstance(value, text_type), "%r is not unicode" % value
+ token = Token(positions, **kwargs)
+ pos = start_pos
+ for match in self.expr.finditer(value):
+ token.text = value[:match.end()]
+ if positions:
+ token.pos = pos
+ pos += 1
+ yield token
+
diff --git a/src/whoosh/automata/__init__.py b/src/whoosh/automata/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/whoosh/automata/__init__.py
diff --git a/src/whoosh/automata/fsa.py b/src/whoosh/automata/fsa.py
new file mode 100644
index 0000000..187e562
--- /dev/null
+++ b/src/whoosh/automata/fsa.py
@@ -0,0 +1,714 @@
+from __future__ import print_function
+
+import itertools
+import operator
+import sys
+from bisect import bisect_left
+from collections import defaultdict
+
+from whoosh.compat import iteritems, next, text_type, unichr, xrange
+
+
+unull = unichr(0)
+
+
+# Marker constants
+
+class Marker(object):
+ def __init__(self, name):
+ self.name = name
+
+ def __repr__(self):
+ return "<%s>" % self.name
+
+
+EPSILON = Marker("EPSILON")
+ANY = Marker("ANY")
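+
+# EPSILON labels transitions that are followed without consuming an input
+# symbol (used for epsilon-closure in NFA._expand); ANY is a wildcard label
+# that matches any input symbol in NFA.next_state.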
+
+
+# Base class
+
+class FSA(object):
+ def __init__(self, initial):
+ self.initial = initial
+ self.transitions = {}
+ self.final_states = set()
+
+ def __len__(self):
+ return len(self.all_states())
+
+ def __eq__(self, other):
+ if self.initial != other.initial:
+ return False
+ if self.final_states != other.final_states:
+ return False
+ st = self.transitions
+ ot = other.transitions
+ if list(st) != list(ot):
+ return False
+ for key in st:
+ if st[key] != ot[key]:
+ return False
+ return True
+
+ def all_states(self):
+ stateset = set(self.transitions)
+ for src, trans in iteritems(self.transitions):
+ stateset.update(trans.values())
+ return stateset
+
+ def all_labels(self):
+ labels = set()
+ for src, trans in iteritems(self.transitions):
+ labels.update(trans)
+ return labels
+
+ def get_labels(self, src):
+ return iter(self.transitions.get(src, []))
+
+ def generate_all(self, state=None, sofar=""):
+ state = self.start() if state is None else state
+ if self.is_final(state):
+ yield sofar
+ for label in sorted(self.get_labels(state)):
+ newstate = self.next_state(state, label)
+ for string in self.generate_all(newstate, sofar + label):
+ yield string
+
+ def start(self):
+ return self.initial
+
+ def next_state(self, state, label):
+ raise NotImplementedError
+
+ def is_final(self, state):
+ raise NotImplementedError
+
+ def add_transition(self, src, label, dest):
+ raise NotImplementedError
+
+ def add_final_state(self, state):
+ raise NotImplementedError
+
+ def to_dfa(self):
+ raise NotImplementedError
+
+ def accept(self, string, debug=False):
+ state = self.start()
+
+ for label in string:
+ if debug:
+ print(" ", state, "->", label, "->")
+
+ state = self.next_state(state, label)
+ if not state:
+ break
+
+ return self.is_final(state)
+
+ def append(self, fsa):
+ self.transitions.update(fsa.transitions)
+ for state in self.final_states:
+ self.add_transition(state, EPSILON, fsa.initial)
+ self.final_states = fsa.final_states
+
+
+# Implementations
+
+class NFA(FSA):
+ def __init__(self, initial):
+ self.transitions = {}
+ self.final_states = set()
+ self.initial = initial
+
+ def dump(self, stream=sys.stdout):
+ starts = self.start()
+ for src in self.transitions:
+ beg = "@" if src in starts else " "
+ print(beg, src, file=stream)
+ xs = self.transitions[src]
+ for label in xs:
+ dests = xs[label]
+                end = "||" if self.is_final(dests) else ""
+                print("    ->", label, "->", dests, end, file=stream)
+
+ def start(self):
+ return frozenset(self._expand(set([self.initial])))
+
+ def add_transition(self, src, label, dest):
+ self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest)
+
+ def add_final_state(self, state):
+ self.final_states.add(state)
+
+ def triples(self):
+ for src, trans in iteritems(self.transitions):
+ for label, dests in iteritems(trans):
+ for dest in dests:
+ yield src, label, dest
+
+ def is_final(self, states):
+ return bool(self.final_states.intersection(states))
+
+ def _expand(self, states):
+ transitions = self.transitions
+ frontier = set(states)
+ while frontier:
+ state = frontier.pop()
+ if state in transitions and EPSILON in transitions[state]:
+ new_states = transitions[state][EPSILON].difference(states)
+ frontier.update(new_states)
+ states.update(new_states)
+ return states
+
+ def next_state(self, states, label):
+ transitions = self.transitions
+ dest_states = set()
+ for state in states:
+ if state in transitions:
+ xs = transitions[state]
+ if label in xs:
+ dest_states.update(xs[label])
+ if ANY in xs:
+ dest_states.update(xs[ANY])
+ return frozenset(self._expand(dest_states))
+
+ def get_labels(self, states):
+ transitions = self.transitions
+ labels = set()
+ for state in states:
+ if state in transitions:
+ labels.update(transitions[state])
+ return labels
+
+ def embed(self, other):
+ # Copy all transitions from the other NFA into this one
+ for s, othertrans in iteritems(other.transitions):
+ trans = self.transitions.setdefault(s, {})
+ for label, otherdests in iteritems(othertrans):
+ dests = trans.setdefault(label, set())
+ dests.update(otherdests)
+
+ def insert(self, src, other, dest):
+ self.embed(other)
+
+ # Connect src to the other NFA's initial state, and the other
+ # NFA's final states to dest
+ self.add_transition(src, EPSILON, other.initial)
+ for finalstate in other.final_states:
+ self.add_transition(finalstate, EPSILON, dest)
+
+ def to_dfa(self):
+ dfa = DFA(self.start())
+ frontier = [self.start()]
+ seen = set()
+ while frontier:
+ current = frontier.pop()
+ if self.is_final(current):
+ dfa.add_final_state(current)
+ labels = self.get_labels(current)
+ for label in labels:
+ if label is EPSILON:
+ continue
+ new_state = self.next_state(current, label)
+ if new_state not in seen:
+ frontier.append(new_state)
+ seen.add(new_state)
+ if self.is_final(new_state):
+ dfa.add_final_state(new_state)
+ if label is ANY:
+ dfa.set_default_transition(current, new_state)
+ else:
+ dfa.add_transition(current, label, new_state)
+ return dfa
+
+
+class DFA(FSA):
+ def __init__(self, initial):
+ self.initial = initial
+ self.transitions = {}
+ self.defaults = {}
+ self.final_states = set()
+ self.outlabels = {}
+
+ def dump(self, stream=sys.stdout):
+ for src in sorted(self.transitions):
+ beg = "@" if src == self.initial else " "
+ print(beg, src, file=stream)
+ xs = self.transitions[src]
+ for label in sorted(xs):
+ dest = xs[label]
+                end = "||" if self.is_final(dest) else ""
+                print("    ->", label, "->", dest, end, file=stream)
+
+ def start(self):
+ return self.initial
+
+ def add_transition(self, src, label, dest):
+ self.transitions.setdefault(src, {})[label] = dest
+
+ def set_default_transition(self, src, dest):
+ self.defaults[src] = dest
+
+ def add_final_state(self, state):
+ self.final_states.add(state)
+
+ def is_final(self, state):
+ return state in self.final_states
+
+ def next_state(self, src, label):
+ trans = self.transitions.get(src, {})
+ return trans.get(label, self.defaults.get(src, None))
+
+ def next_valid_string(self, string, asbytes=False):
+ state = self.start()
+ stack = []
+
+ # Follow the DFA as far as possible
+ for i, label in enumerate(string):
+ stack.append((string[:i], state, label))
+ state = self.next_state(state, label)
+ if not state:
+ break
+ else:
+ stack.append((string[:i + 1], state, None))
+
+ if self.is_final(state):
+ # Word is already valid
+ return string
+
+ # Perform a 'wall following' search for the lexicographically smallest
+ # accepting state.
+ while stack:
+ path, state, label = stack.pop()
+ label = self.find_next_edge(state, label, asbytes=asbytes)
+ if label:
+ path += label
+ state = self.next_state(state, label)
+ if self.is_final(state):
+ return path
+ stack.append((path, state, None))
+ return None
+
+ def find_next_edge(self, s, label, asbytes):
+ if label is None:
+ label = b"\x00" if asbytes else u'\0'
+ else:
+ label = (label + 1) if asbytes else unichr(ord(label) + 1)
+ trans = self.transitions.get(s, {})
+ if label in trans or s in self.defaults:
+ return label
+
+ try:
+ labels = self.outlabels[s]
+ except KeyError:
+ self.outlabels[s] = labels = sorted(trans)
+
+ pos = bisect_left(labels, label)
+ if pos < len(labels):
+ return labels[pos]
+ return None
+
+ def reachable_from(self, src, inclusive=True):
+ transitions = self.transitions
+
+ reached = set()
+ if inclusive:
+ reached.add(src)
+
+ stack = [src]
+ seen = set()
+ while stack:
+ src = stack.pop()
+ seen.add(src)
+ for _, dest in iteritems(transitions[src]):
+ reached.add(dest)
+ if dest not in seen:
+ stack.append(dest)
+ return reached
+
+ def minimize(self):
+ transitions = self.transitions
+ initial = self.initial
+
+ # Step 1: Delete unreachable states
+ reachable = self.reachable_from(initial)
+ for src in list(transitions):
+ if src not in reachable:
+ del transitions[src]
+ final_states = self.final_states.intersection(reachable)
+ labels = self.all_labels()
+
+ # Step 2: Partition the states into equivalence sets
+ changed = True
+ parts = [final_states, reachable - final_states]
+ while changed:
+ changed = False
+ for i in xrange(len(parts)):
+ part = parts[i]
+ changed_part = False
+ for label in labels:
+ next_part = None
+ new_part = set()
+ for state in part:
+ dest = transitions[state].get(label)
+ if dest is not None:
+ if next_part is None:
+ for p in parts:
+ if dest in p:
+ next_part = p
+ elif dest not in next_part:
+ new_part.add(state)
+ changed = True
+ changed_part = True
+ if changed_part:
+ old_part = part - new_part
+ parts.pop(i)
+ parts.append(old_part)
+ parts.append(new_part)
+ break
+
+ # Choose one state from each equivalence set and map all equivalent
+ # states to it
+ new_trans = {}
+
+ # Create mapping
+ mapping = {}
+ new_initial = None
+ for part in parts:
+ representative = part.pop()
+ if representative is initial:
+ new_initial = representative
+ mapping[representative] = representative
+ new_trans[representative] = {}
+ for state in part:
+ if state is initial:
+ new_initial = representative
+ mapping[state] = representative
+ assert new_initial is not None
+
+ # Apply mapping to existing transitions
+ new_finals = set(mapping[s] for s in final_states)
+ for state, d in iteritems(new_trans):
+ trans = transitions[state]
+ for label, dest in iteritems(trans):
+ d[label] = mapping[dest]
+
+ # Remove dead states - non-final states with no outgoing arcs except
+ # to themselves
+ non_final_srcs = [src for src in new_trans if src not in new_finals]
+ removing = set()
+ for src in non_final_srcs:
+ dests = set(new_trans[src].values())
+ dests.discard(src)
+ if not dests:
+ removing.add(src)
+ del new_trans[src]
+ # Delete transitions to removed dead states
+ for t in new_trans.values():
+ for label in list(t):
+ if t[label] in removing:
+ del t[label]
+
+ self.transitions = new_trans
+ self.initial = new_initial
+ self.final_states = new_finals
+
+ def to_dfa(self):
+ return self
+
+
+# Useful functions
+
+def renumber_dfa(dfa, base=0):
+ c = itertools.count(base)
+ mapping = {}
+
+ def remap(state):
+ if state in mapping:
+ newnum = mapping[state]
+ else:
+ newnum = next(c)
+ mapping[state] = newnum
+ return newnum
+
+ newdfa = DFA(remap(dfa.initial))
+ for src, trans in iteritems(dfa.transitions):
+ for label, dest in iteritems(trans):
+ newdfa.add_transition(remap(src), label, remap(dest))
+ for finalstate in dfa.final_states:
+ newdfa.add_final_state(remap(finalstate))
+ for src, dest in iteritems(dfa.defaults):
+ newdfa.set_default_transition(remap(src), remap(dest))
+ return newdfa
+
+
+def u_to_utf8(dfa, base=0):
+ c = itertools.count(base)
+ transitions = dfa.transitions
+
+ for src, trans in iteritems(transitions):
+ trans = transitions[src]
+ for label, dest in list(iteritems(trans)):
+ if label is EPSILON:
+ continue
+ elif label is ANY:
+ raise Exception
+ else:
+ assert isinstance(label, text_type)
+ label8 = label.encode("utf8")
+ for i, byte in enumerate(label8):
+ if i < len(label8) - 1:
+ st = next(c)
+ dfa.add_transition(src, byte, st)
+ src = st
+ else:
+ dfa.add_transition(src, byte, dest)
+ del trans[label]
+
+
+def find_all_matches(dfa, lookup_func, first=unull):
+ """
+    Uses lookup_func to find all words in a database that are accepted by the
+    given DFA -- for example, all words within levenshtein distance k of a
+    query word when the DFA was built from a levenshtein automaton.
+
+    Args:
+        dfa: The DFA to intersect the database's keys with.
+        lookup_func: A single argument function that returns the first word in
+            the database that is greater than or equal to the input argument,
+            or None if there is no such word.
+        first: The key to start searching from.
+    Yields:
+        Every word from the database that the DFA accepts.
+ """
+
+ match = dfa.next_valid_string(first)
+ while match:
+ key = lookup_func(match)
+ if key is None:
+ return
+ if match == key:
+ yield match
+ key += unull
+ match = dfa.next_valid_string(key)
+
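+# A hedged usage sketch (not from the original source): with a DFA built from
+# a levenshtein automaton and a lookup function over a sorted word list, this
+# yields every stored word within the given edit distance.
+#
+#   from whoosh.automata.lev import levenshtein_automaton
+#   words = sorted([u"cellar", u"colour", u"couler", u"dollar"])
+#
+#   def lookup(key):
+#       i = bisect_left(words, key)
+#       return words[i] if i < len(words) else None
+#
+#   dfa = levenshtein_automaton(u"color", 1).to_dfa()
+#   print(list(find_all_matches(dfa, lookup)))  # -> ['colour'] (distance 1)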
+
+# Construction functions
+
+def reverse_nfa(n):
+ s = object()
+ nfa = NFA(s)
+ for src, trans in iteritems(n.transitions):
+ for label, destset in iteritems(trans):
+ for dest in destset:
+ nfa.add_transition(dest, label, src)
+ for finalstate in n.final_states:
+ nfa.add_transition(s, EPSILON, finalstate)
+ nfa.add_final_state(n.initial)
+ return nfa
+
+
+def product(dfa1, op, dfa2):
+ dfa1 = dfa1.to_dfa()
+ dfa2 = dfa2.to_dfa()
+ start = (dfa1.start(), dfa2.start())
+ dfa = DFA(start)
+    stack = [start]
+    seen = set([start])
+    while stack:
+        src = stack.pop()
+        state1, state2 = src
+        trans1 = set(dfa1.transitions.get(state1, {}))
+        trans2 = set(dfa2.transitions.get(state2, {}))
+        for label in trans1.intersection(trans2):
+            # Use separate names for the destination states so the source
+            # states aren't clobbered for the remaining labels
+            dest1 = dfa1.next_state(state1, label)
+            dest2 = dfa2.next_state(state2, label)
+            if op(dest1 is not None, dest2 is not None):
+                dest = (dest1, dest2)
+                dfa.add_transition(src, label, dest)
+                if dest not in seen:
+                    seen.add(dest)
+                    stack.append(dest)
+                if op(dfa1.is_final(dest1), dfa2.is_final(dest2)):
+                    dfa.add_final_state(dest)
+ return dfa
+
+
+def intersection(dfa1, dfa2):
+ return product(dfa1, operator.and_, dfa2)
+
+
+def union(dfa1, dfa2):
+ return product(dfa1, operator.or_, dfa2)
+
+
+def epsilon_nfa():
+ return basic_nfa(EPSILON)
+
+
+def dot_nfa():
+ return basic_nfa(ANY)
+
+
+def basic_nfa(label):
+ s = object()
+ e = object()
+ nfa = NFA(s)
+ nfa.add_transition(s, label, e)
+ nfa.add_final_state(e)
+ return nfa
+
+
+def charset_nfa(labels):
+ s = object()
+ e = object()
+ nfa = NFA(s)
+ for label in labels:
+ nfa.add_transition(s, label, e)
+ nfa.add_final_state(e)
+ return nfa
+
+
+def string_nfa(string):
+ s = object()
+ e = object()
+ nfa = NFA(s)
+ for label in string:
+ e = object()
+ nfa.add_transition(s, label, e)
+ s = e
+ nfa.add_final_state(e)
+ return nfa
+
+
+def choice_nfa(n1, n2):
+ s = object()
+ e = object()
+ nfa = NFA(s)
+ # -> nfa1 -
+ # / \
+ # s e
+ # \ /
+ # -> nfa2 -
+ nfa.insert(s, n1, e)
+ nfa.insert(s, n2, e)
+ nfa.add_final_state(e)
+ return nfa
+
+
+def concat_nfa(n1, n2):
+ s = object()
+ m = object()
+ e = object()
+ nfa = NFA(s)
+ nfa.insert(s, n1, m)
+ nfa.insert(m, n2, e)
+ nfa.add_final_state(e)
+ return nfa
+
+
+def star_nfa(n):
+ s = object()
+ e = object()
+ nfa = NFA(s)
+ # -----<-----
+ # / \
+ # s ---> n ---> e
+ # \ /
+ # ----->-----
+
+ nfa.insert(s, n, e)
+ nfa.add_transition(s, EPSILON, e)
+ for finalstate in n.final_states:
+ nfa.add_transition(finalstate, EPSILON, s)
+ nfa.add_final_state(e)
+ return nfa
+
+
+def plus_nfa(n):
+ return concat_nfa(n, star_nfa(n))
+
+
+def optional_nfa(n):
+ return choice_nfa(n, epsilon_nfa())
+
+
+# Daciuk Mihov DFA construction algorithm
+
+class DMNode(object):
+ def __init__(self, n):
+ self.n = n
+ self.arcs = {}
+ self.final = False
+
+ def __repr__(self):
+ return "<%s, %r>" % (self.n, self.tuple())
+
+ def __hash__(self):
+ return hash(self.tuple())
+
+ def tuple(self):
+ arcs = tuple(sorted(iteritems(self.arcs)))
+ return arcs, self.final
+
+
+def strings_dfa(strings):
+ dfa = DFA(0)
+ c = itertools.count(1)
+
+ last = ""
+ seen = {}
+ nodes = [DMNode(0)]
+
+ for string in strings:
+ if string <= last:
+ raise Exception("Strings must be in order")
+ if not string:
+ raise Exception("Can't add empty string")
+
+ # Find the common prefix with the previous string
+ i = 0
+ while i < len(last) and i < len(string) and last[i] == string[i]:
+ i += 1
+ prefixlen = i
+
+ # Freeze the transitions after the prefix, since they're not shared
+ add_suffix(dfa, nodes, last, prefixlen + 1, seen)
+
+ # Create new nodes for the substring after the prefix
+ for label in string[prefixlen:]:
+ node = DMNode(next(c))
+ # Create an arc from the previous node to this node
+ nodes[-1].arcs[label] = node.n
+ nodes.append(node)
+ # Mark the last node as an accept state
+ nodes[-1].final = True
+
+ last = string
+
+ if len(nodes) > 1:
+ add_suffix(dfa, nodes, last, 0, seen)
+ return dfa
+
+
+def add_suffix(dfa, nodes, last, downto, seen):
+ while len(nodes) > downto:
+ node = nodes.pop()
+ tup = node.tuple()
+
+ # If a node just like this one (final/nonfinal, same arcs to same
+ # destinations) is already seen, replace with it
+ try:
+ this = seen[tup]
+ except KeyError:
+ this = node.n
+ if node.final:
+ dfa.add_final_state(this)
+ seen[tup] = this
+ else:
+ # If we replaced the node with an already seen one, fix the parent
+ # node's pointer to this
+ parent = nodes[-1]
+ inlabel = last[len(nodes) - 1]
+ parent.arcs[inlabel] = this
+
+ # Add the node's transitions to the DFA
+ for label, dest in iteritems(node.arcs):
+ dfa.add_transition(this, label, dest)
+
+
+
+
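+# Hedged self-check (not part of the original module): build a small DFA from
+# a sorted list of strings using the Daciuk-Mihov construction above and test
+# membership with accept().
+if __name__ == "__main__":
+    _dfa = strings_dfa([u"cat", u"cats", u"dog"])
+    for _word in (u"cat", u"cats", u"dog", u"dot", u"ca"):
+        print(_word, _dfa.accept(_word))  # True, True, True, False, False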
diff --git a/src/whoosh/automata/glob.py b/src/whoosh/automata/glob.py
new file mode 100644
index 0000000..b8fbc87
--- /dev/null
+++ b/src/whoosh/automata/glob.py
@@ -0,0 +1,90 @@
+# Copyright 2012 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.automata.fsa import ANY, EPSILON, NFA
+
+
+# Constants for glob
+_LIT = 0
+_STAR = 1
+_PLUS = 2
+_QUEST = 3
+_RANGE = 4
+
+
+def parse_glob(pattern, _glob_multi="*", _glob_single="?",
+ _glob_range1="[", _glob_range2="]"):
+ pos = 0
+ last = None
+ while pos < len(pattern):
+ char = pattern[pos]
+ pos += 1
+ if char == _glob_multi: # *
+ # (Ignore more than one star in a row)
+ if last is not _STAR:
+ yield _STAR, None
+ last = _STAR
+ elif char == _glob_single: # ?
+ # (Ignore ? after a star)
+ if last is not _STAR:
+ yield _QUEST, None
+ last = _QUEST
+ elif char == _glob_range1: # [
+ chars = set()
+ negate = False
+ # Take the char range specification until the ]
+ while pos < len(pattern):
+ char = pattern[pos]
+ pos += 1
+ if char == _glob_range2:
+ break
+ chars.add(char)
+ if chars:
+ yield _RANGE, (chars, negate)
+ last = _RANGE
+ else:
+ yield _LIT, char
+ last = _LIT
+
+
+def glob_automaton(pattern):
+ nfa = NFA(0)
+ i = -1
+ for i, (op, arg) in enumerate(parse_glob(pattern)):
+ if op is _LIT:
+ nfa.add_transition(i, arg, i + 1)
+ elif op is _STAR:
+ nfa.add_transition(i, ANY, i + 1)
+ nfa.add_transition(i, EPSILON, i + 1)
+ nfa.add_transition(i + 1, EPSILON, i)
+ elif op is _QUEST:
+ nfa.add_transition(i, ANY, i + 1)
+ elif op is _RANGE:
+ for char in arg[0]:
+ nfa.add_transition(i, char, i + 1)
+ nfa.add_final_state(i + 1)
+ return nfa
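+
+
+# Hedged usage sketch (not part of the original module): compile a glob
+# pattern to an NFA, determinize it, and test a few strings against it.
+if __name__ == "__main__":
+    _dfa = glob_automaton("b*b").to_dfa()
+    for _word in ("bob", "blurb", "bo", "rob"):
+        # Expected: bob and blurb match, bo and rob do not
+        print("%s -> %s" % (_word, _dfa.accept(_word)))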
diff --git a/src/whoosh/automata/lev.py b/src/whoosh/automata/lev.py
new file mode 100644
index 0000000..7067c64
--- /dev/null
+++ b/src/whoosh/automata/lev.py
@@ -0,0 +1,30 @@
+from __future__ import print_function
+
+from whoosh.compat import unichr, xrange
+from whoosh.automata.fsa import ANY, EPSILON, NFA, unull
+
+
+def levenshtein_automaton(term, k, prefix=0):
+ nfa = NFA((0, 0))
+ if prefix:
+ for i in xrange(prefix):
+ c = term[i]
+ nfa.add_transition((i, 0), c, (i + 1, 0))
+
+ for i in xrange(prefix, len(term)):
+ c = term[i]
+ for e in xrange(k + 1):
+ # Correct character
+ nfa.add_transition((i, e), c, (i + 1, e))
+ if e < k:
+ # Deletion
+ nfa.add_transition((i, e), ANY, (i, e + 1))
+ # Insertion
+ nfa.add_transition((i, e), EPSILON, (i + 1, e + 1))
+ # Substitution
+ nfa.add_transition((i, e), ANY, (i + 1, e + 1))
+ for e in xrange(k + 1):
+ if e < k:
+ nfa.add_transition((len(term), e), ANY, (len(term), e + 1))
+ nfa.add_final_state((len(term), e))
+ return nfa
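+
+
+# Hedged usage sketch (not part of the original module): strings within the
+# given edit distance of the term are accepted, others are rejected.
+if __name__ == "__main__":
+    _nfa = levenshtein_automaton(u"brown", 1)
+    for _word in (u"brown", u"brwn", u"browns", u"bran"):
+        print(_word, _nfa.accept(_word))  # True, True, True, False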
diff --git a/src/whoosh/automata/nfa.py b/src/whoosh/automata/nfa.py
new file mode 100644
index 0000000..6ea72be
--- /dev/null
+++ b/src/whoosh/automata/nfa.py
@@ -0,0 +1,388 @@
+# Copyright 2012 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from whoosh.automata.fst import Arc
+
+
+class Instruction(object):
+ def __repr__(self):
+ return "%s()" % (self.__class__.__name__, )
+
+
+class Char(Instruction):
+ """
+ Matches a literal character.
+ """
+
+ def __init__(self, c):
+ self.c = c
+
+ def __repr__(self):
+ return "Char(%r)" % self.c
+
+class Lit(Instruction):
+ """
+ Matches a literal string.
+ """
+
+ def __init__(self, c):
+ self.c = c
+
+ def __repr__(self):
+ return "Lit(%r)" % self.c
+
+
+class Any(Instruction):
+ """
+ Matches any character.
+ """
+
+
+class Match(Instruction):
+ """
+ Stop this thread: the string matched.
+ """
+
+ def __repr__(self):
+ return "Match()"
+
+
+class Jmp(Instruction):
+ """
+ Jump to a specified instruction.
+ """
+
+ def __init__(self, x):
+ self.x = x
+
+ def __repr__(self):
+ return "Jmp(%s)" % self.x
+
+
+class Split(Instruction):
+ """
+ Split execution: continue at two separate specified instructions.
+ """
+
+ def __init__(self, x, y):
+ self.x = x
+ self.y = y
+
+ def __repr__(self):
+ return "Split(%s, %s)" % (self.x, self.y)
+
+
+class Label(Instruction):
+ """
+ Placeholder to act as a target for JMP instructions
+ """
+
+ def __hash__(self):
+ return id(self)
+
+ def __repr__(self):
+ return "L(%s)" % hex(id(self))
+
+
+def concat(e1, e2):
+ return e1 + e2
+
+
+def alt(e1, e2):
+ L1, L2, L3 = Label(), Label(), Label()
+ return [L1] + e1 + [Jmp(L3), L2] + e2 + [L3]
+
+
+def zero_or_one(e):
+ L1, L2 = Label(), Label()
+ return [Split(L1, L2), L1] + e + [L2]
+
+
+def zero_or_more(e):
+ L1, L2, L3 = Label(), Label(), Label()
+ return [L1, Split(L2, L3), L2] + e + [Jmp(L1), L3]
+
+
+def one_or_more(e):
+ L1, L2 = Label(), Label()
+ return [L1] + e + [Split(L1, L2), L2]
+
+
+def fixup(program):
+ refs = {}
+ i = 0
+ while i < len(program):
+ op = program[i]
+ if isinstance(op, Label):
+ refs[op] = i
+ program.pop(i)
+ else:
+ i += 1
+
+ if refs:
+ for op in program:
+ if isinstance(op, (Jmp, Split)):
+ op.x = refs[op.x]
+ if isinstance(op, Split):
+ op.y = refs[op.y]
+
+ return program + [Match]
+
+
+class ThreadList(object):
+ def __init__(self, program, max=1000):
+ self.program = program
+ self.max = max
+ self.threads = []
+
+ def __nonzero__(self):
+ return bool(self.threads)
+
+ def current(self):
+ return self.threads.pop()
+
+ def add(self, thread):
+ op = self.program[thread.pc]
+ optype = type(op)
+ if optype is Jmp:
+ self.add(thread.at(op.x))
+ elif optype is Split:
+ self.add(thread.copy_at(op.x))
+ self.add(thread.at(op.y))
+ else:
+ self.threads.append(thread)
+
+
+class Thread(object):
+ def __init__(self, pc, address, sofar='', accept=False):
+ self.pc = pc
+ self.address = address
+ self.sofar = sofar
+ self.accept = accept
+
+ def at(self, pc):
+ self.pc = pc
+ return self
+
+ def copy_at(self, pc):
+ return Thread(pc, self.address, self.sofar, self.accept)
+
+ def __repr__(self):
+ d = self.__dict__
+ return "Thread(%s)" % ",".join("%s=%r" % (k, v) for k, v in d.items())
+
+
+def advance(thread, arc, c):
+ thread.pc += 1
+ thread.address = arc.target
+ thread.sofar += c
+ thread.accept = arc.accept
+
+
+def run(graph, program, address):
+ threads = ThreadList(program)
+ threads.add(Thread(0, address))
+ arc = Arc()
+ while threads:
+ thread = threads.current()
+ address = thread.address
+ op = program[thread.pc]
+ optype = type(op)
+
+ if optype is Char:
+ if address:
+ arc = graph.find_arc(address, op.c, arc)
+ if arc:
+                    advance(thread, arc, op.c)
+ threads.add(thread)
+ elif optype is Lit:
+ if address:
+ c = op.c
+ arc = graph.find_path(c, arc, address)
+ if arc:
+ advance(thread, arc, c)
+ threads.add(thread)
+ elif optype is Any:
+ if address:
+ sofar = thread.sofar
+ pc = thread.pc + 1
+ for arc in graph.iter_arcs(address, arc):
+ t = Thread(pc, arc.target, sofar + arc.label, arc.accept)
+ threads.add(t)
+ elif op is Match:
+ if thread.accept:
+ yield thread.sofar
+ else:
+ raise Exception("Don't know what to do with %r" % op)
+
+
+LO = 0
+HI = 1
+
+
+def regex_limit(graph, mode, program, address):
+ low = mode == LO
+ output = []
+ threads = ThreadList(program)
+ threads.add(Thread(0, address))
+ arc = Arc()
+ while threads:
+ thread = threads.current()
+ address = thread.address
+ op = program[thread.pc]
+ optype = type(op)
+
+ if optype is Char:
+ if address:
+ arc = graph.find_arc(address, op.c, arc)
+ if arc:
+ if low and arc.accept:
+                        return thread.sofar + op.c
+                    advance(thread, arc, op.c)
+ threads.add(thread)
+ elif optype is Lit:
+ if address:
+ labels = op.c
+ for label in labels:
+ arc = graph.find_arc(address, label)
+ if arc is None:
+ return thread.sofar
+ elif thread.accept:
+ return thread.sofar
+ elif optype is Any:
+ if address:
+ if low:
+ arc = graph.arc_at(address, arc)
+ else:
+ for arc in graph.iter_arcs(address):
+ pass
+ advance(thread, arc, arc.label)
+ threads.add(thread)
+ elif thread.accept:
+ return thread.sofar
+ elif op is Match:
+ return thread.sofar
+ else:
+ raise Exception("Don't know what to do with %r" % op)
+
+
+# if __name__ == "__main__":
+# from whoosh import index, query
+# from whoosh.filedb.filestore import RamStorage
+# from whoosh.automata import fst
+# from whoosh.util.testing import timing
+#
+# st = RamStorage()
+# gw = fst.GraphWriter(st.create_file("test"))
+# gw.start_field("test")
+# for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]:
+# gw.insert(key)
+# gw.close()
+# gr = fst.GraphReader(st.open_file("test"))
+#
+# program = one_or_more([Lit("a")])
+# print program
+# program = fixup(program)
+# print program
+# print list(run(gr, program, gr.root("test")))
+#
+# ix = index.open_dir("e:/dev/src/houdini/help/index")
+# r = ix.reader()
+# gr = r._get_graph()
+#
+# # program = fixup([Any(), Any(), Any(), Any(), Any()])
+# # program = fixup(concat(zero_or_more([Any()]), [Char("/")]))
+# # with timing():
+# # x = list(run(gr, program, gr.root("path")))
+# # print len(x)
+#
+# q = query.Regex("path", "^.[abc].*/$")
+# with timing():
+# y = list(q._btexts(r))
+# print len(y)
+# print y[0], y[-1]
+#
+# pr = [Any()] + alt([Lit("c")], alt([Lit("b")], [Lit("a")])) + zero_or_more([Any()]) + [Lit("/")]
+# program = fixup(pr)
+# # with timing():
+# # x = list(run(gr, program, gr.root("path")))
+# # print len(x), x
+#
+# with timing():
+# print "lo=", regex_limit(gr, LO, program, gr.root("path"))
+# print "hi=", regex_limit(gr, HI, program, gr.root("path"))
+#
+#
+#
+# #int
+# #backtrackingvm(Inst *prog, char *input)
+# #{
+# # enum { MAXTHREAD = 1000 };
+# # Thread ready[MAXTHREAD];
+# # int nready;
+# # Inst *pc;
+# # char *sp;
+# #
+# # /* queue initial thread */
+# # ready[0] = thread(prog, input);
+# # nready = 1;
+# #
+# # /* run threads in stack order */
+# # while(nready > 0){
+# # --nready; /* pop state for next thread to run */
+# # pc = ready[nready].pc;
+# # sp = ready[nready].sp;
+# # for(;;){
+# # switch(pc->opcode){
+# # case Char:
+# # if(*sp != pc->c)
+# # goto Dead;
+# # pc++;
+# # sp++;
+# # continue;
+# # case Match:
+# # return 1;
+# # case Jmp:
+# # pc = pc->x;
+# # continue;
+# # case Split:
+# # if(nready >= MAXTHREAD){
+# # fprintf(stderr, "regexp overflow");
+# # return -1;
+# # }
+# # /* queue new thread */
+# # ready[nready++] = thread(pc->y, sp);
+# # pc = pc->x; /* continue current thread */
+# # continue;
+# # }
+# # }
+# # Dead:;
+# # }
+# # return 0;
+# #}
+#
+#
diff --git a/src/whoosh/automata/reg.py b/src/whoosh/automata/reg.py
new file mode 100644
index 0000000..578071e
--- /dev/null
+++ b/src/whoosh/automata/reg.py
@@ -0,0 +1,135 @@
+# Copyright 2014 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+import re
+from whoosh.automata.fsa import ANY, EPSILON, NFA
+
+
+# Operator precedence
+CHOICE = ("|", )
+ops = ()
+
+
+def parse(pattern):
+ stack = []
+ ops = []
+
+
+
+
+class RegexBuilder(object):
+ def __init__(self):
+ self.statenum = 1
+
+ def new_state(self):
+ self.statenum += 1
+ return self.statenum
+
+ def epsilon(self):
+ s = self.new_state()
+ e = self.new_state()
+ nfa = NFA(s)
+ nfa.add_transition(s, EPSILON, e)
+ nfa.add_final_state(e)
+ return nfa
+
+ def char(self, label):
+ s = self.new_state()
+ e = self.new_state()
+ nfa = NFA(s)
+ nfa.add_transition(s, label, e)
+ nfa.add_final_state(e)
+ return nfa
+
+ def charset(self, chars):
+ s = self.new_state()
+ e = self.new_state()
+ nfa = NFA(s)
+ for char in chars:
+ nfa.add_transition(s, char, e)
+ nfa.add_final_state(e)
+        return nfa
+
+ def dot(self):
+ s = self.new_state()
+ e = self.new_state()
+ nfa = NFA(s)
+ nfa.add_transition(s, ANY, e)
+ nfa.add_final_state(e)
+ return nfa
+
+ def choice(self, n1, n2):
+ s = self.new_state()
+ s1 = self.new_state()
+ s2 = self.new_state()
+ e1 = self.new_state()
+ e2 = self.new_state()
+ e = self.new_state()
+ nfa = NFA(s)
+ nfa.add_transition(s, EPSILON, s1)
+ nfa.add_transition(s, EPSILON, s2)
+ nfa.insert(s1, n1, e1)
+ nfa.insert(s2, n2, e2)
+ nfa.add_transition(e1, EPSILON, e)
+ nfa.add_transition(e2, EPSILON, e)
+ nfa.add_final_state(e)
+ return nfa
+
+ def concat(self, n1, n2):
+ s = self.new_state()
+ m = self.new_state()
+ e = self.new_state()
+ nfa = NFA(s)
+ nfa.insert(s, n1, m)
+ nfa.insert(m, n2, e)
+ nfa.add_final_state(e)
+ return nfa
+
+ def star(self, n):
+ s = self.new_state()
+ m1 = self.new_state()
+ m2 = self.new_state()
+ e = self.new_state()
+ nfa = NFA(s)
+ nfa.add_transition(s, EPSILON, m1)
+ nfa.add_transition(s, EPSILON, e)
+ nfa.insert(m1, n, m2)
+ nfa.add_transition(m2, EPSILON, m1)
+ nfa.add_transition(m2, EPSILON, e)
+ nfa.add_final_state(e)
+ return nfa
+
+ def plus(self, n):
+ return self.concat(n, self.star(n))
+
+ def question(self, n):
+ return self.choice(n, self.epsilon())
+
+
+
+
+
diff --git a/src/whoosh/classify.py b/src/whoosh/classify.py
new file mode 100755
index 0000000..628edf5
--- /dev/null
+++ b/src/whoosh/classify.py
@@ -0,0 +1,377 @@
+# Copyright 2008 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+"""Classes and functions for classifying and extracting information from
+documents.
+"""
+
+from __future__ import division
+import random
+from collections import defaultdict
+from math import log
+
+from whoosh.compat import xrange, iteritems
+
+
+# Expansion models
+
+class ExpansionModel(object):
+ def __init__(self, doc_count, field_length):
+ self.N = doc_count
+ self.collection_total = field_length
+
+ if self.N:
+ self.mean_length = self.collection_total / self.N
+ else:
+ self.mean_length = 0
+
+ def normalizer(self, maxweight, top_total):
+ raise NotImplementedError
+
+ def score(self, weight_in_top, weight_in_collection, top_total):
+ raise NotImplementedError
+
+
+class Bo1Model(ExpansionModel):
+ def normalizer(self, maxweight, top_total):
+ f = maxweight / self.N
+ return (maxweight * log((1.0 + f) / f) + log(1.0 + f)) / log(2.0)
+
+ def score(self, weight_in_top, weight_in_collection, top_total):
+ f = weight_in_collection / self.N
+ return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)
+
+
+class Bo2Model(ExpansionModel):
+ def normalizer(self, maxweight, top_total):
+ f = maxweight * self.N / self.collection_total
+ return maxweight * log((1.0 + f) / f, 2) + log(1.0 + f, 2)
+
+ def score(self, weight_in_top, weight_in_collection, top_total):
+ f = weight_in_top * top_total / self.collection_total
+ return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)
+
+
+class KLModel(ExpansionModel):
+ def normalizer(self, maxweight, top_total):
+ return (maxweight * log(self.collection_total / top_total) / log(2.0)
+ * top_total)
+
+ def score(self, weight_in_top, weight_in_collection, top_total):
+ wit_over_tt = weight_in_top / top_total
+ wic_over_ct = weight_in_collection / self.collection_total
+
+ if wit_over_tt < wic_over_ct:
+ return 0
+ else:
+            return wit_over_tt * log(wit_over_tt / wic_over_ct, 2)
+
+
+class Expander(object):
+ """Uses an ExpansionModel to expand the set of query terms based on the top
+ N result documents.
+ """
+
+ def __init__(self, ixreader, fieldname, model=Bo1Model):
+ """
+        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
+ :param fieldname: The name of the field in which to search.
+ :param model: (classify.ExpansionModel) The model to use for expanding
+ the query terms. If you omit this parameter, the expander uses
+ :class:`Bo1Model` by default.
+ """
+
+ self.ixreader = ixreader
+ self.fieldname = fieldname
+ doccount = self.ixreader.doc_count_all()
+ fieldlen = self.ixreader.field_length(fieldname)
+
+ if type(model) is type:
+ model = model(doccount, fieldlen)
+ self.model = model
+
+ # Maps words to their weight in the top N documents.
+ self.topN_weight = defaultdict(float)
+
+ # Total weight of all terms in the top N documents.
+ self.top_total = 0
+
+ def add(self, vector):
+ """Adds forward-index information about one of the "top N" documents.
+
+ :param vector: A series of (text, weight) tuples, such as is
+ returned by Reader.vector_as("weight", docnum, fieldname).
+ """
+
+ total_weight = 0
+ topN_weight = self.topN_weight
+
+ for word, weight in vector:
+ total_weight += weight
+ topN_weight[word] += weight
+
+ self.top_total += total_weight
+
+ def add_document(self, docnum):
+ ixreader = self.ixreader
+ if self.ixreader.has_vector(docnum, self.fieldname):
+ self.add(ixreader.vector_as("weight", docnum, self.fieldname))
+ elif self.ixreader.schema[self.fieldname].stored:
+ self.add_text(ixreader.stored_fields(docnum).get(self.fieldname))
+ else:
+ raise Exception("Field %r in document %s is not vectored or stored"
+ % (self.fieldname, docnum))
+
+ def add_text(self, string):
+ # Unfortunately since field.index() yields bytes texts, and we want
+ # unicode, we end up encoding and decoding unnecessarily.
+ #
+ # TODO: Find a way around this
+
+ field = self.ixreader.schema[self.fieldname]
+ from_bytes = field.from_bytes
+ self.add((from_bytes(text), weight) for text, _, weight, _
+ in field.index(string))
+
+ def expanded_terms(self, number, normalize=True):
+ """Returns the N most important terms in the vectors added so far.
+
+ :param number: The number of terms to return.
+ :param normalize: Whether to normalize the weights.
+ :returns: A list of ("term", weight) tuples.
+ """
+
+ model = self.model
+ fieldname = self.fieldname
+ ixreader = self.ixreader
+ field = ixreader.schema[fieldname]
+ tlist = []
+ maxweight = 0
+
+ # If no terms have been added, return an empty list
+ if not self.topN_weight:
+ return []
+
+ for word, weight in iteritems(self.topN_weight):
+ btext = field.to_bytes(word)
+ if (fieldname, btext) in ixreader:
+ cf = ixreader.frequency(fieldname, btext)
+ score = model.score(weight, cf, self.top_total)
+ if score > maxweight:
+ maxweight = score
+ tlist.append((score, word))
+
+ if normalize:
+ norm = model.normalizer(maxweight, self.top_total)
+ else:
+ norm = maxweight
+ tlist = [(weight / norm, t) for weight, t in tlist]
+ tlist.sort(key=lambda x: (0 - x[0], x[1]))
+
+ return [(t, weight) for weight, t in tlist[:number]]
+
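+# Hedged usage sketch (hypothetical searcher, query and field names -- not
+# part of this module): expand a query from the text of the top hits of a
+# search, assuming the "content" field is stored.
+#
+#   expander = Expander(searcher.reader(), "content")
+#   for hit in searcher.search(userquery, limit=10):
+#       expander.add_text(hit["content"])
+#   print(expander.expanded_terms(5))  # [(term, weight), ...]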
+
+# Similarity functions
+
+def shingles(input, size=2):
+ d = defaultdict(int)
+ for shingle in (input[i:i + size]
+ for i in xrange(len(input) - (size - 1))):
+ d[shingle] += 1
+ return iteritems(d)
+
+
+def simhash(features, hashbits=32):
+ if hashbits == 32:
+ hashfn = hash
+ else:
+ hashfn = lambda s: _hash(s, hashbits)
+
+ vs = [0] * hashbits
+ for feature, weight in features:
+ h = hashfn(feature)
+ for i in xrange(hashbits):
+ if h & (1 << i):
+ vs[i] += weight
+ else:
+ vs[i] -= weight
+
+ out = 0
+ for i, v in enumerate(vs):
+ if v > 0:
+ out |= 1 << i
+ return out
+
+
+def _hash(s, hashbits):
+ # A variable-length version of Python's builtin hash
+ if s == "":
+ return 0
+ else:
+ x = ord(s[0]) << 7
+ m = 1000003
+ mask = 2 ** hashbits - 1
+ for c in s:
+ x = ((x * m) ^ ord(c)) & mask
+ x ^= len(s)
+ if x == -1:
+ x = -2
+ return x
+
+
+def hamming_distance(first_hash, other_hash, hashbits=32):
+ x = (first_hash ^ other_hash) & ((1 << hashbits) - 1)
+ tot = 0
+ while x:
+ tot += 1
+ x &= x - 1
+ return tot
+
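+# Hedged self-check sketch (not part of the original module): near-duplicate
+# texts should yield simhashes that differ in only a few bits, while unrelated
+# text should differ in many more.
+#
+#   a = simhash(shingles("the quick brown fox jumps over the lazy dog"))
+#   b = simhash(shingles("the quick brown fox jumped over the lazy dog"))
+#   c = simhash(shingles("an entirely different sentence about indexing"))
+#   print(hamming_distance(a, b), hamming_distance(a, c))  # small, larger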
+
+# Clustering
+
+def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None):
+ """
+ One-dimensional K-means clustering function.
+
+ :param data: list of data points.
+ :param k: number of clusters.
+ :param t: tolerance; stop if changes between iterations are smaller than
+ this value.
+ :param distfun: a distance function.
+ :param centers: a list of centroids to start with.
+ :param maxiter: maximum number of iterations to run.
+ """
+
+ # Adapted from a C version by Roger Zhang, <rogerz@cs.dal.ca>
+ # http://cs.smu.ca/~r_zhang/code/kmeans.c
+
+ DOUBLE_MAX = 1.797693e308
+ n = len(data)
+
+ error = DOUBLE_MAX # sum of squared euclidean distance
+
+ counts = [0] * k # size of each cluster
+ labels = [0] * n # output cluster label for each data point
+
+ # c1 is an array of len k of the temp centroids
+ c1 = [0] * k
+
+ # choose k initial centroids
+ if centers:
+ c = centers
+ else:
+ c = random.sample(data, k)
+
+ niter = 0
+ # main loop
+ while True:
+ # save error from last step
+ old_error = error
+ error = 0
+
+ # clear old counts and temp centroids
+ for i in xrange(k):
+ counts[i] = 0
+ c1[i] = 0
+
+ for h in xrange(n):
+ # identify the closest cluster
+ min_distance = DOUBLE_MAX
+ for i in xrange(k):
+ distance = (data[h] - c[i]) ** 2
+ if distance < min_distance:
+ labels[h] = i
+ min_distance = distance
+
+ # update size and temp centroid of the destination cluster
+ c1[labels[h]] += data[h]
+ counts[labels[h]] += 1
+ # update standard error
+ error += min_distance
+
+ for i in xrange(k): # update all centroids
+ c[i] = c1[i] / counts[i] if counts[i] else c1[i]
+
+ niter += 1
+ if (abs(error - old_error) < t) or (niter > maxiter):
+ break
+
+ return labels, c
+
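+# Hedged usage sketch (not part of the original module): with explicit starting
+# centers the clustering is deterministic, so the two groups below separate
+# cleanly.
+#
+#   labels, centers = kmeans([1.0, 1.1, 0.9, 10.0, 10.2, 9.8], 2,
+#                            centers=[1.0, 10.0])
+#   print(labels)   # [0, 0, 0, 1, 1, 1]
+#   print(centers)  # approximately [1.0, 10.0]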
+
+# Sliding window clusters
+
+def two_pass_variance(data):
+ n = 0
+ sum1 = 0
+ sum2 = 0
+
+ for x in data:
+ n += 1
+ sum1 = sum1 + x
+
+ mean = sum1 / n
+
+ for x in data:
+ sum2 += (x - mean) * (x - mean)
+
+ variance = sum2 / (n - 1)
+ return variance
+
+
+def weighted_incremental_variance(data_weight_pairs):
+ mean = 0
+ S = 0
+ sumweight = 0
+ for x, weight in data_weight_pairs:
+ temp = weight + sumweight
+ Q = x - mean
+ R = Q * weight / temp
+ S += sumweight * Q * R
+ mean += R
+ sumweight = temp
+ Variance = S / (sumweight - 1) # if sample is the population, omit -1
+ return Variance
+
+
+def swin(data, size):
+ clusters = []
+ for i, left in enumerate(data):
+ j = i
+ right = data[j]
+ while j < len(data) - 1 and right - left < size:
+ j += 1
+ right = data[j]
+ v = 99999
+ if j - i > 1:
+ v = two_pass_variance(data[i:j + 1])
+ clusters.append((left, right, j - i, v))
+ clusters.sort(key=lambda x: (0 - x[2], x[3]))
+ return clusters
diff --git a/src/whoosh/codec/__init__.py b/src/whoosh/codec/__init__.py
new file mode 100644
index 0000000..7044563
--- /dev/null
+++ b/src/whoosh/codec/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2012 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+
+def default_codec(*args, **kwargs):
+ from whoosh.codec.whoosh3 import W3Codec
+
+ return W3Codec(*args, **kwargs)
diff --git a/src/whoosh/codec/base.py b/src/whoosh/codec/base.py
new file mode 100644
index 0000000..159a978
--- /dev/null
+++ b/src/whoosh/codec/base.py
@@ -0,0 +1,843 @@
+# Copyright 2011 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+"""
+This module contains base classes/interfaces for "codec" objects.
+"""
+
+from bisect import bisect_right
+
+from whoosh import columns
+from whoosh.automata import lev
+from whoosh.compat import abstractmethod, izip, unichr, xrange
+from whoosh.filedb.compound import CompoundStorage
+from whoosh.system import emptybytes
+from whoosh.util import random_name
+
+
+# Exceptions
+
+class OutOfOrderError(Exception):
+ pass
+
+
+# Base classes
+
+class Codec(object):
+ length_stats = True
+
+ # Per document value writer
+
+ @abstractmethod
+ def per_document_writer(self, storage, segment):
+ raise NotImplementedError
+
+ # Inverted index writer
+
+ @abstractmethod
+ def field_writer(self, storage, segment):
+ raise NotImplementedError
+
+ # Postings
+
+ @abstractmethod
+ def postings_writer(self, dbfile, byteids=False):
+ raise NotImplementedError
+
+ @abstractmethod
+ def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
+ raise NotImplementedError
+
+ # Index readers
+
+ def automata(self, storage, segment):
+ return Automata()
+
+ @abstractmethod
+ def terms_reader(self, storage, segment):
+ raise NotImplementedError
+
+ @abstractmethod
+ def per_document_reader(self, storage, segment):
+ raise NotImplementedError
+
+ # Segments and generations
+
+ @abstractmethod
+ def new_segment(self, storage, indexname):
+ raise NotImplementedError
+
+
+class WrappingCodec(Codec):
+ def __init__(self, child):
+ self._child = child
+
+ def per_document_writer(self, storage, segment):
+ return self._child.per_document_writer(storage, segment)
+
+ def field_writer(self, storage, segment):
+ return self._child.field_writer(storage, segment)
+
+ def postings_writer(self, dbfile, byteids=False):
+ return self._child.postings_writer(dbfile, byteids=byteids)
+
+ def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
+ return self._child.postings_reader(dbfile, terminfo, format_, term=term,
+ scorer=scorer)
+
+ def automata(self, storage, segment):
+ return self._child.automata(storage, segment)
+
+ def terms_reader(self, storage, segment):
+ return self._child.terms_reader(storage, segment)
+
+ def per_document_reader(self, storage, segment):
+ return self._child.per_document_reader(storage, segment)
+
+ def new_segment(self, storage, indexname):
+ return self._child.new_segment(storage, indexname)
+
+
+# Writer classes
+
+class PerDocumentWriter(object):
+ @abstractmethod
+ def start_doc(self, docnum):
+ raise NotImplementedError
+
+ @abstractmethod
+ def add_field(self, fieldname, fieldobj, value, length):
+ raise NotImplementedError
+
+ @abstractmethod
+ def add_column_value(self, fieldname, columnobj, value):
+ raise NotImplementedError("Codec does not implement writing columns")
+
+ @abstractmethod
+ def add_vector_items(self, fieldname, fieldobj, items):
+ raise NotImplementedError
+
+ def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
+ def readitems():
+ while vmatcher.is_active():
+ text = vmatcher.id()
+ weight = vmatcher.weight()
+ valuestring = vmatcher.value()
+ yield (text, weight, valuestring)
+ vmatcher.next()
+ self.add_vector_items(fieldname, fieldobj, readitems())
+
+ def finish_doc(self):
+ pass
+
+ def close(self):
+ pass
+
+
+class FieldWriter(object):
+ def add_postings(self, schema, lengths, items):
+ # This method translates a generator of (fieldname, btext, docnum, w, v)
+ # postings into calls to start_field(), start_term(), add(),
+ # finish_term(), finish_field(), etc.
+
+ start_field = self.start_field
+ start_term = self.start_term
+ add = self.add
+ finish_term = self.finish_term
+ finish_field = self.finish_field
+
+ if lengths:
+ dfl = lengths.doc_field_length
+ else:
+ dfl = lambda docnum, fieldname: 0
+
+ # The fieldname of the previous posting
+ lastfn = None
+ # The bytes text of the previous posting
+ lasttext = None
+ # The (fieldname, btext) of the previous spelling posting
+ lastspell = None
+ # The field object for the current field
+ fieldobj = None
+ for fieldname, btext, docnum, weight, value in items:
+ # Check for out-of-order postings. This is convoluted because Python
+ # 3 removed the ability to compare a string to None
+ if lastfn is not None and fieldname < lastfn:
+ raise OutOfOrderError("Field %r .. %r" % (lastfn, fieldname))
+ if fieldname == lastfn and lasttext and btext < lasttext:
+ raise OutOfOrderError("Term %s:%r .. %s:%r"
+ % (lastfn, lasttext, fieldname, btext))
+
+ # If the fieldname of this posting is different from the last one,
+ # tell the writer we're starting a new field
+ if fieldname != lastfn:
+ if lasttext is not None:
+ finish_term()
+ if lastfn is not None and fieldname != lastfn:
+ finish_field()
+ fieldobj = schema[fieldname]
+ start_field(fieldname, fieldobj)
+ lastfn = fieldname
+ lasttext = None
+
+ # HACK: items where docnum == -1 indicate words that should be added
+ # to the spelling graph, not the postings
+ if docnum == -1:
+ # spellterm = (fieldname, btext)
+ # # There can be duplicates of spelling terms, so only add a spell
+ # # term if it's greater than the last one
+ # if lastspell is None or spellterm > lastspell:
+ # spellword = fieldobj.from_bytes(btext)
+ # self.add_spell_word(fieldname, spellword)
+ # lastspell = spellterm
+ continue
+
+ # If this term is different from the term in the previous posting,
+ # tell the writer to start a new term
+ if btext != lasttext:
+ if lasttext is not None:
+ finish_term()
+ start_term(btext)
+ lasttext = btext
+
+ # Add this posting
+ length = dfl(docnum, fieldname)
+ if value is None:
+ value = emptybytes
+ add(docnum, weight, value, length)
+
+ if lasttext is not None:
+ finish_term()
+ if lastfn is not None:
+ finish_field()
+
+ @abstractmethod
+ def start_field(self, fieldname, fieldobj):
+ raise NotImplementedError
+
+ @abstractmethod
+ def start_term(self, text):
+ raise NotImplementedError
+
+ @abstractmethod
+ def add(self, docnum, weight, vbytes, length):
+ raise NotImplementedError
+
+ def add_spell_word(self, fieldname, text):
+ raise NotImplementedError
+
+ @abstractmethod
+ def finish_term(self):
+ raise NotImplementedError
+
+ def finish_field(self):
+ pass
+
+ def close(self):
+ pass
+
+
+# Postings
+
+class PostingsWriter(object):
+ @abstractmethod
+ def start_postings(self, format_, terminfo):
+ raise NotImplementedError
+
+ @abstractmethod
+ def add_posting(self, id_, weight, vbytes, length=None):
+ raise NotImplementedError
+
+ def finish_postings(self):
+ pass
+
+ @abstractmethod
+ def written(self):
+ """Returns True if this object has already written to disk.
+ """
+
+ raise NotImplementedError
+
+
+# Reader classes
+
+class FieldCursor(object):
+ def first(self):
+ raise NotImplementedError
+
+ def find(self, string):
+ raise NotImplementedError
+
+ def next(self):
+ raise NotImplementedError
+
+ def term(self):
+ raise NotImplementedError
+
+
+class TermsReader(object):
+ @abstractmethod
+ def __contains__(self, term):
+ raise NotImplementedError
+
+ @abstractmethod
+ def cursor(self, fieldname, fieldobj):
+ raise NotImplementedError
+
+ @abstractmethod
+ def terms(self):
+ raise NotImplementedError
+
+ @abstractmethod
+ def terms_from(self, fieldname, prefix):
+ raise NotImplementedError
+
+ @abstractmethod
+ def items(self):
+ raise NotImplementedError
+
+ @abstractmethod
+ def items_from(self, fieldname, prefix):
+ raise NotImplementedError
+
+ @abstractmethod
+ def term_info(self, fieldname, text):
+ raise NotImplementedError
+
+ @abstractmethod
+ def frequency(self, fieldname, text):
+ return self.term_info(fieldname, text).weight()
+
+ @abstractmethod
+ def doc_frequency(self, fieldname, text):
+ return self.term_info(fieldname, text).doc_frequency()
+
+ @abstractmethod
+ def matcher(self, fieldname, text, format_, scorer=None):
+ raise NotImplementedError
+
+ @abstractmethod
+ def indexed_field_names(self):
+ raise NotImplementedError
+
+ def close(self):
+ pass
+
+
+class Automata(object):
+ @staticmethod
+ def levenshtein_dfa(uterm, maxdist, prefix=0):
+ return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa()
+
+ @staticmethod
+ def find_matches(dfa, cur):
+ unull = unichr(0)
+
+ term = cur.text()
+ if term is None:
+ return
+
+ match = dfa.next_valid_string(term)
+ while match:
+ cur.find(match)
+ term = cur.text()
+ if term is None:
+ return
+ if match == term:
+ yield match
+ term += unull
+ match = dfa.next_valid_string(term)
+
+ def terms_within(self, fieldcur, uterm, maxdist, prefix=0):
+ dfa = self.levenshtein_dfa(uterm, maxdist, prefix)
+ return self.find_matches(dfa, fieldcur)
+
+
+# Per-doc value reader
+
+class PerDocumentReader(object):
+ def close(self):
+ pass
+
+ @abstractmethod
+ def doc_count(self):
+ raise NotImplementedError
+
+ @abstractmethod
+ def doc_count_all(self):
+ raise NotImplementedError
+
+ # Deletions
+
+ @abstractmethod
+ def has_deletions(self):
+ raise NotImplementedError
+
+ @abstractmethod
+ def is_deleted(self, docnum):
+ raise NotImplementedError
+
+ @abstractmethod
+ def deleted_docs(self):
+ raise NotImplementedError
+
+ def all_doc_ids(self):
+ """
+ Returns an iterator of all (undeleted) document IDs in the reader.
+ """
+
+ is_deleted = self.is_deleted
+ return (docnum for docnum in xrange(self.doc_count_all())
+ if not is_deleted(docnum))
+
+ def iter_docs(self):
+ for docnum in self.all_doc_ids():
+ yield docnum, self.stored_fields(docnum)
+
+ # Columns
+
+ def supports_columns(self):
+ return False
+
+ def has_column(self, fieldname):
+ return False
+
+ def list_columns(self):
+ raise NotImplementedError
+
+ # Don't need to override this if supports_columns() returns False
+ def column_reader(self, fieldname, column):
+ raise NotImplementedError
+
+ # Bitmaps
+
+ def field_docs(self, fieldname):
+ return None
+
+ # Lengths
+
+ @abstractmethod
+ def doc_field_length(self, docnum, fieldname, default=0):
+ raise NotImplementedError
+
+ @abstractmethod
+ def field_length(self, fieldname):
+ raise NotImplementedError
+
+ @abstractmethod
+ def min_field_length(self, fieldname):
+ raise NotImplementedError
+
+ @abstractmethod
+ def max_field_length(self, fieldname):
+ raise NotImplementedError
+
+ # Vectors
+
+ def has_vector(self, docnum, fieldname):
+ return False
+
+ # Don't need to override this if has_vector() always returns False
+ def vector(self, docnum, fieldname, format_):
+ raise NotImplementedError
+
+ # Stored
+
+ @abstractmethod
+ def stored_fields(self, docnum):
+ raise NotImplementedError
+
+ def all_stored_fields(self):
+ for docnum in self.all_doc_ids():
+ yield self.stored_fields(docnum)
+
+
+# Segment base class
+
+class Segment(object):
+ """Do not instantiate this object directly. It is used by the Index object
+    to hold information about a segment. A list of objects of this class is
+    pickled as part of the TOC file.
+
+ The TOC file stores a minimal amount of information -- mostly a list of
+    Segment objects. Segments are the real inverted indexes. Having multiple
+ segments allows quick incremental indexing: just create a new segment for
+ the new documents, and have the index overlay the new segment over previous
+ ones for purposes of reading/search. "Optimizing" the index combines the
+ contents of existing segments into one (removing any deleted documents
+ along the way).
+ """
+
+ # Extension for compound segment files
+ COMPOUND_EXT = ".seg"
+
+    # Instance attributes set in __init__:
+    #   self.indexname
+    #   self.segid
+
+ def __init__(self, indexname):
+ self.indexname = indexname
+ self.segid = self._random_id()
+ self.compound = False
+
+ @classmethod
+ def _random_id(cls, size=16):
+ return random_name(size=size)
+
+ def __repr__(self):
+ return "<%s %s>" % (self.__class__.__name__, self.segment_id())
+
+ def codec(self):
+ raise NotImplementedError
+
+ def index_name(self):
+ return self.indexname
+
+ def segment_id(self):
+ if hasattr(self, "name"):
+ # Old segment class
+ return self.name
+ else:
+ return "%s_%s" % (self.index_name(), self.segid)
+
+ def is_compound(self):
+ if not hasattr(self, "compound"):
+ return False
+ return self.compound
+
+ # File convenience methods
+
+ def make_filename(self, ext):
+ return "%s%s" % (self.segment_id(), ext)
+
+ def list_files(self, storage):
+ prefix = "%s." % self.segment_id()
+ return [name for name in storage.list() if name.startswith(prefix)]
+
+ def create_file(self, storage, ext, **kwargs):
+ """Convenience method to create a new file in the given storage named
+ with this segment's ID and the given extension. Any keyword arguments
+ are passed to the storage's create_file method.
+ """
+
+ fname = self.make_filename(ext)
+ return storage.create_file(fname, **kwargs)
+
+ def open_file(self, storage, ext, **kwargs):
+ """Convenience method to open a file in the given storage named with
+ this segment's ID and the given extension. Any keyword arguments are
+ passed to the storage's open_file method.
+ """
+
+ fname = self.make_filename(ext)
+ return storage.open_file(fname, **kwargs)
+
+ def create_compound_file(self, storage):
+ segfiles = self.list_files(storage)
+ assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles)
+ cfile = self.create_file(storage, self.COMPOUND_EXT)
+ CompoundStorage.assemble(cfile, storage, segfiles)
+ for name in segfiles:
+ storage.delete_file(name)
+ self.compound = True
+
+ def open_compound_file(self, storage):
+ name = self.make_filename(self.COMPOUND_EXT)
+ dbfile = storage.open_file(name)
+ return CompoundStorage(dbfile, use_mmap=storage.supports_mmap)
+
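+    # Illustrative sketch of how a concrete codec typically uses the
+    # convenience methods above (the ".pst" extension is hypothetical, not
+    # one any particular codec necessarily uses):
+    #
+    #     postfile = segment.create_file(storage, ".pst")
+    #     postfile.write(b"...")
+    #     postfile.close()
+    #     segment.create_compound_file(storage)  # pack parts into one file
+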
+ # Abstract methods
+
+ @abstractmethod
+ def doc_count_all(self):
+ """
+ Returns the total number of documents, DELETED OR UNDELETED, in this
+ segment.
+ """
+
+ raise NotImplementedError
+
+ def doc_count(self):
+ """
+ Returns the number of (undeleted) documents in this segment.
+ """
+
+ return self.doc_count_all() - self.deleted_count()
+
+ def set_doc_count(self, doccount):
+ raise NotImplementedError
+
+ def has_deletions(self):
+ """
+ Returns True if any documents in this segment are deleted.
+ """
+
+ return self.deleted_count() > 0
+
+ @abstractmethod
+ def deleted_count(self):
+ """
+ Returns the total number of deleted documents in this segment.
+ """
+
+ raise NotImplementedError
+
+ @abstractmethod
+ def deleted_docs(self):
+ raise NotImplementedError
+
+ @abstractmethod
+ def delete_document(self, docnum, delete=True):
+ """Deletes the given document number. The document is not actually
+ removed from the index until it is optimized.
+
+ :param docnum: The document number to delete.
+ :param delete: If False, this undeletes a deleted document.
+ """
+
+ raise NotImplementedError
+
+ @abstractmethod
+ def is_deleted(self, docnum):
+ """
+ Returns True if the given document number is deleted.
+ """
+
+ raise NotImplementedError
+
+ def should_assemble(self):
+ return True
+
+
+# Wrapping Segment
+
+class WrappingSegment(Segment):
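+    """A Segment that wraps another segment object and delegates all methods
+    to it.
+    """
+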
+ def __init__(self, child):
+ self._child = child
+
+ def codec(self):
+ return self._child.codec()
+
+ def index_name(self):
+ return self._child.index_name()
+
+ def segment_id(self):
+ return self._child.segment_id()
+
+ def is_compound(self):
+ return self._child.is_compound()
+
+ def should_assemble(self):
+ return self._child.should_assemble()
+
+ def make_filename(self, ext):
+ return self._child.make_filename(ext)
+
+ def list_files(self, storage):
+ return self._child.list_files(storage)
+
+ def create_file(self, storage, ext, **kwargs):
+ return self._child.create_file(storage, ext, **kwargs)
+
+ def open_file(self, storage, ext, **kwargs):
+ return self._child.open_file(storage, ext, **kwargs)
+
+ def create_compound_file(self, storage):
+ return self._child.create_compound_file(storage)
+
+ def open_compound_file(self, storage):
+ return self._child.open_compound_file(storage)
+
+ def delete_document(self, docnum, delete=True):
+ return self._child.delete_document(docnum, delete=delete)
+
+ def has_deletions(self):
+ return self._child.has_deletions()
+
+ def deleted_count(self):
+ return self._child.deleted_count()
+
+ def deleted_docs(self):
+ return self._child.deleted_docs()
+
+ def is_deleted(self, docnum):
+ return self._child.is_deleted(docnum)
+
+ def set_doc_count(self, doccount):
+ self._child.set_doc_count(doccount)
+
+ def doc_count(self):
+ return self._child.doc_count()
+
+ def doc_count_all(self):
+ return self._child.doc_count_all()
+
+
+# Multi per doc reader
+
+class MultiPerDocumentReader(PerDocumentReader):
+ def __init__(self, readers, offset=0):
+ self._readers = readers
+
+ self._doc_offsets = []
+ self._doccount = 0
+ for pdr in readers:
+ self._doc_offsets.append(self._doccount)
+ self._doccount += pdr.doc_count_all()
+
+ self.is_closed = False
+
+ def close(self):
+ for r in self._readers:
+ r.close()
+ self.is_closed = True
+
+ def doc_count_all(self):
+ return self._doccount
+
+ def doc_count(self):
+ total = 0
+ for r in self._readers:
+ total += r.doc_count()
+ return total
+
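+    # The two helpers below map a global document number onto the sub-reader
+    # that holds it (and its reader-local number) by bisecting the cumulative
+    # per-reader document offsets.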
+ def _document_reader(self, docnum):
+ return max(0, bisect_right(self._doc_offsets, docnum) - 1)
+
+ def _reader_and_docnum(self, docnum):
+ rnum = self._document_reader(docnum)
+ offset = self._doc_offsets[rnum]
+ return rnum, docnum - offset
+
+ # Deletions
+
+ def has_deletions(self):
+ return any(r.has_deletions() for r in self._readers)
+
+ def is_deleted(self, docnum):
+ x, y = self._reader_and_docnum(docnum)
+ return self._readers[x].is_deleted(y)
+
+ def deleted_docs(self):
+ for r, offset in izip(self._readers, self._doc_offsets):
+ for docnum in r.deleted_docs():
+ yield docnum + offset
+
+ def all_doc_ids(self):
+ for r, offset in izip(self._readers, self._doc_offsets):
+ for docnum in r.all_doc_ids():
+ yield docnum + offset
+
+ # Columns
+
+ def has_column(self, fieldname):
+ return any(r.has_column(fieldname) for r in self._readers)
+
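+    # Build one column reader per sub-reader, substituting an empty reader
+    # filled with the column's default value where a sub-reader lacks the
+    # column, so document numbering stays aligned across readers.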
+ def column_reader(self, fieldname, column):
+ if not self.has_column(fieldname):
+ raise ValueError("No column %r" % (fieldname,))
+
+ default = column.default_value()
+ colreaders = []
+ for r in self._readers:
+ if r.has_column(fieldname):
+ cr = r.column_reader(fieldname, column)
+ else:
+ cr = columns.EmptyColumnReader(default, r.doc_count_all())
+ colreaders.append(cr)
+
+ if len(colreaders) == 1:
+ return colreaders[0]
+ else:
+ return columns.MultiColumnReader(colreaders)
+
+ # Lengths
+
+ def doc_field_length(self, docnum, fieldname, default=0):
+ x, y = self._reader_and_docnum(docnum)
+ return self._readers[x].doc_field_length(y, fieldname, default)
+
+ def field_length(self, fieldname):
+ total = 0
+ for r in self._readers:
+ total += r.field_length(fieldname)
+ return total
+
+    def min_field_length(self, fieldname):
+        return min(r.min_field_length(fieldname) for r in self._readers)
+
+    def max_field_length(self, fieldname):
+        return max(r.max_field_length(fieldname) for r in self._readers)
+
+
+# Extended base classes
+
+class PerDocWriterWithColumns(PerDocumentWriter):
+ def __init__(self):
+ PerDocumentWriter.__init__(self)
+ # Implementations need to set these attributes
+ self._storage = None
+ self._segment = None
+ self._docnum = None
+
+ @abstractmethod
+ def _has_column(self, fieldname):
+ raise NotImplementedError
+
+ @abstractmethod
+ def _create_column(self, fieldname, column):
+ raise NotImplementedError
+
+ @abstractmethod
+ def _get_column(self, fieldname):
+ raise NotImplementedError
+
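+    # Create the column writer lazily on first use, then append the value at
+    # the current document number.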
+ def add_column_value(self, fieldname, column, value):
+ if not self._has_column(fieldname):
+ self._create_column(fieldname, column)
+ self._get_column(fieldname).add(self._docnum, value)
+
+
+# FieldCursor implementations
+
+class EmptyCursor(FieldCursor):
+ def first(self):
+ return None
+
+ def find(self, term):
+ return None
+
+ def next(self):
+ return None
+
+ def text(self):
+ return None
+
+ def term_info(self):
+ return None
+
+ def is_valid(self):
+ return False
diff --git a/src/whoosh/codec/memory.py b/src/whoosh/codec/memory.py
new file mode 100644
index 0000000..5a5babe
--- /dev/null
+++ b/src/whoosh/codec/memory.py
@@ -0,0 +1,334 @@
+# Copyright 2012 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are
+# those of the authors and should not be interpreted as representing official
+# policies, either expressed or implied, of Matt Chaput.
+
+from __future__ import with_statement
+from bisect import bisect_left
+from threading import Lock, RLock
+
+from whoosh.compat import xrange
+from whoosh.codec import base
+from whoosh.matching import ListMatcher
+from whoosh.reading import SegmentReader, TermInfo, TermNotFound
+from whoosh.writing import SegmentWriter
+
+
+class MemWriter(SegmentWriter):
+ def commit(self):
+ self._finalize_segment()
+
+
+class MemoryCodec(base.Codec):
+ def __init__(self):
+ from whoosh.filedb.filestore import RamStorage
+
+ self.storage = RamStorage()
+ self.segment = MemSegment(self, "blah")
+
+ def writer(self, schema):
+ ix = self.storage.create_index(schema)
+ return MemWriter(ix, _lk=False, codec=self,
+ docbase=self.segment._doccount)
+
+ def reader(self, schema):
+ return SegmentReader(self.storage, schema, self.segment, codec=self)
+
+ def per_document_writer(self, storage, segment):
+ return MemPerDocWriter(self.storage, self.segment)
+
+ def field_writer(self, storage, segment):
+ return MemFieldWriter(self.storage, self.segment)
+
+ def per_document_reader(self, storage, segment):
+ return MemPerDocReader(self.storage, self.segment)
+
+ def terms_reader(self, storage, segment):
+ return MemTermsReader(self.storage, self.segment)
+
+ def new_segment(self, storage, indexname):
+ return self.segment
+
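+
+# Illustrative sketch (a hypothetical helper, not part of the upstream API):
+# MemoryCodec keeps a single RamStorage-backed segment, so a small in-memory
+# index can be driven directly through the codec. The schema used here is an
+# assumption for the example.
+def _memory_codec_example():
+    from whoosh import fields
+
+    schema = fields.Schema(title=fields.TEXT(stored=True))
+    codec = MemoryCodec()
+    writer = codec.writer(schema)
+    writer.add_document(title=u"an in-memory document")
+    writer.commit()
+    # Reading goes back through the same codec/segment pair
+    return codec.reader(schema)
+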
+
+class MemPerDocWriter(base.PerDocWriterWithColumns):
+ def __init__(self, storage, segment):
+ self._storage = storage
+ self._segment = segment
+ self.is_closed = False
+ self._colwriters = {}
+ self._doccount = 0
+
+ def _has_column(self, fieldname):
+ return fieldname in self._colwriters
+
+ def _create_column(self, fieldname, column):
+ colfile = self._storage.create_file("%s.c" % fieldname)
+ self._colwriters[fieldname] = (colfile, column.writer(colfile))
+
+ def _get_column(self, fieldname):
+ return self._colwriters[fieldname][1]
+
+ def start_doc(self, docnum):
+ self._doccount += 1
+ self._docnum = docnum
+ self._stored = {}
+ self._lengths = {}
+ self._vectors = {}
+
+ def add_field(self, fieldname, fieldobj, value, length):
+ if value is not None:
+ self._stored[fieldname] = value
+ if length is not None:
+ self._lengths[fieldname] = length
+
+ def add_vector_items(self, fieldname, fieldobj, items):
+ self._vectors[fieldname] = tuple(items)
+
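+    # Publish the buffered per-document data into the shared segment under
+    # its lock.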
+ def finish_doc(self):
+ with self._segment._lock:
+ docnum = self._docnum
+ self._segment._stored[docnum] = self._stored
+ self._segment._lengths[docnum] = self._lengths
+ self._segment._vectors[docnum] = self._vectors
+
+ def close(self):
+ colwriters = self._colwriters
+ for fieldname in colwriters:
+ colfile, colwriter = colwriters[fieldname]
+ colwriter.finish(self._doccount)
+ colfile.close()
+ self.is_closed = True
+
+
+class MemPerDocReader(base.PerDocumentReader):
+ def __init__(self, storage, segment):
+ self._storage = storage
+ self._segment = segment
+
+ def doc_count(self):
+ return self._segment.doc_count()
+
+ def doc_count_all(self):
+ return self._segment.doc_count_all()
+
+ def has_deletions(self):
+ return self._segment.has_deletions()
+
+ def is_deleted(self, docnum):
+ return self._segment.is_deleted(docnum)
+
+ def deleted_docs(self):
+ return self._segment.deleted_docs()
+
+ def supports_columns(self):
+ return True
+
+ def has_column(self, fieldname):
+ filename = "%s.c" % fieldname
+ return self._storage.file_exists(filename)
+
+ def column_reader(self, fieldname, column):
+ filename = "%s.c" % fieldname
+ colfile = self._storage.open_file(filename)
+ length = self._storage.file_length(filename)
+ return column.reader(colfile, 0, length, self._segment.doc_count_all())
+
+ def doc_field_length(self, docnum, fieldname, default=0):
+ return self._segment._lengths[docnum].get(fieldname, default)
+
+ def field_length(self, fieldname):
+ return sum(lens.get(fieldname, 0) for lens
+ in self._segment._lengths.values())
+
+ def min_field_length(self, fieldname):
+ return min(lens[fieldname] for lens in self._segment._lengths.values()
+ if fieldname in lens)
+
+ def max_field_length(self, fieldname):
+ return max(lens[fieldname] for lens in self._segment._lengths.values()
+ if fieldname in lens)
+
+ def has_vector(self, docnum, fieldname):
+ return (docnum in self._segment._vectors
+ and fieldname in self._segment._vectors[docnum])
+
+ def vector(self, docnum, fieldname, format_):
+ items = self._segment._vectors[docnum][fieldname]
+ ids, weights, values = zip(*items)
+ return ListMatcher(ids, weights, values, format_)
+
+ def stored_fields(self, docnum):
+ return self._segment._stored[docnum]
+
+ def close(self):
+ pass
+
+
+class MemFieldWriter(base.FieldWriter):
+ def __init__(self, storage, segment):
+ self._storage = storage
+ self._segment = segment
+ self._fieldname = None
+ self._btext = None
+ self.is_closed = False
+
+ def start_field(self, fieldname, fieldobj):
+ if self._fieldname is not None:
+ raise Exception("Called start_field in a field")
+
+ with self._segment._lock:
+ invindex = self._segment._invindex
+ if fieldname not in invindex:
+ invindex[fieldname] = {}
+
+ self._fieldname = fieldname
+ self._fieldobj = fieldobj
+
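+    # Postings are kept in segment._invindex[fieldname][btext] as a list of
+    # (docnum, weight, vbytes) tuples; per-term statistics are accumulated in
+    # segment._terminfos[(fieldname, btext)].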
+ def start_term(self, btext):
+ if self._btext is not None:
+ raise Exception("Called start_term in a term")
+ fieldname = self._fieldname
+
+ fielddict = self._segment._invindex[fieldname]
+ terminfos = self._segment._terminfos
+ with self._segment._lock:
+ if btext not in fielddict:
+ fielddict[btext] = []
+
+ if (fieldname, btext) not in terminfos:
+ terminfos[fieldname, btext] = TermInfo()
+
+ self._postings = fielddict[btext]
+ self._terminfo = terminfos[fieldname, btext]
+ self._btext = btext
+
+ def add(self, docnum, weight, vbytes, length):
+ self._postings.append((docnum, weight, vbytes))
+ self._terminfo.add_posting(docnum, weight, length)
+
+ def finish_term(self):
+ if self._btext is None:
+ raise Exception("Called finish_term outside a term")
+
+ self._postings = None
+ self._btext = None
+ self._terminfo = None
+
+ def finish_field(self):
+ if self._fieldname is None:
+ raise Exception("Called finish_field outside a field")
+ self._fieldname = None
+ self._fieldobj = None
+
+ def close(self):
+ self.is_closed = True
+
+
+class MemTermsReader(base.TermsReader):
+ def __init__(self, storage, segment):
+ self._storage = storage
+ self._segment = segment
+ self._invindex = segment._invindex
+
+ def __contains__(self, term):
+ return term in self._segment._terminfos
+
+ def terms(self):
+ for fieldname in self._invindex:
+ for btext in self._invindex[fieldname]:
+ yield (fieldname, btext)
+
+ def terms_from(self, fieldname, prefix):
+ if fieldname not in self._invindex:
+ raise TermNotFound("Unknown field %r" % (fieldname,))
+ terms = sorted(self._invindex[fieldname])
+ if not terms:
+ return
+ start = bisect_left(terms, prefix)
+ for i in xrange(start, len(terms)):
+ yield (fieldname, terms[i])
+
+ def term_info(self, fieldname, text):
+ return self._segment._terminfos[fieldname, text]
+
+ def matcher(self, fieldname, btext, format_, scorer=None):
+ items = self._invindex[fieldname][btext]
+ ids, weights, values = zip(*items)
+ return ListMatcher(ids, weights, values, format_, scorer=scorer)
+
+ def indexed_field_names(self):
+ return self._invindex.keys()
+
+ def close(self):
+ pass
+
+
+class MemSegment(base.Segment):
+ def __init__(self, codec, indexname):
+ base.Segment.__init__(self, indexname)
+ self._codec = codec
+ self._doccount = 0
+ self._stored = {}
+ self._lengths = {}
+ self._vectors = {}
+ self._invindex = {}
+ self._terminfos = {}
+ self._lock = Lock()
+
+ def codec(self):
+ return self._codec
+
+ def set_doc_count(self, doccount):
+ self._doccount = doccount
+
+ def doc_count(self):
+ return len(self._stored)
+
+ def doc_count_all(self):
+ return self._doccount
+
+ def delete_document(self, docnum, delete=True):
+ if not delete:
+ raise Exception("MemoryCodec can't undelete")
+ with self._lock:
+ del self._stored[docnum]
+ del self._lengths[docnum]
+ del self._vectors[docnum]
+
+    def has_deletions(self):
+        with self._lock:
+            return self._doccount - len(self._stored) > 0
+
+ def is_deleted(self, docnum):
+ return docnum not in self._stored
+
+ def deleted_docs(self):
+ stored = self._stored
+ for docnum in xrange(self.doc_count_all()):
+ if docnum not in stored:
+ yield docnum
+
+ def should_assemble(self):
+ return False
diff --git a/src/whoosh/codec/plaintext.py b/src/whoosh/codec/plaintext.py
new file mode 100644
index 0000000..fda91cc
--- /dev/null
+++ b/src/whoosh/codec/plaintext.py
@@ -0,0 +1,452 @@
+# Copyright 2012 Matt Chaput. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROF