[notmuch] Patch for xapian defect #250
Kan-Ru Chen
kanru at kanru.info
Wed Dec 9 23:00:42 PST 2009
The termlist is already sorted, so this is the patch trying to minimize
the modification of database as suggested in the comment and Carl's
TODO file.
My poor profiling shows not much, but some improvement.
*Before*
% time notmuch tag +test id:hfntnu+gotv at eGroups.com
MOD_PLISTS: 368
notmuch tag +test id:hfntnu+gotv at eGroups.com 0.05s user 0.03s system 11% cpu 0.673 total
% time notmuch tag -test id:hfntnu+gotv at eGroups.com
MOD_PLISTS: 368
notmuch tag -test id:hfntnu+gotv at eGroups.com 0.06s user 0.01s system 10% cpu 0.681 total
*After*
% time notmuch tag +test id:hfntnu+gotv at eGroups.com
MOD_PLIST: 1
notmuch tag +test id:hfntnu+gotv at eGroups.com 0.01s user 0.02s system 6% cpu 0.436 total
% time notmuch tag -test id:hfntnu+gotv at eGroups.com
MOD_PLIST: 1
notmuch tag -test id:hfntnu+gotv at eGroups.com 0.01s user 0.01s system 5% cpu 0.383 total
% time notmuch tag +test tag:notmuch
notmuch tag +test tag:notmuch 1.71s user 0.03s system 65% cpu 2.632 total
% time notmuch tag -test tag:notmuch
notmuch tag -test tag:notmuch 1.61s user 0.02s system 73% cpu 2.204 total
% notmuch count tag:notmuch
682
--- flint_database.cc 2009-12-08 13:34:24.790284881 +0800
+++ flint_database.cc 2009-12-10 14:22:14.493653956 +0800
@@ -1188,7 +1188,7 @@
termlist.next();
while (!termlist.at_end()) {
- string tname = termlist.get_termname();
+ string tname = termlist.get_termname();
position_table.delete_positionlist(did, tname);
termcount wdf = termlist.get_wdf();
@@ -1278,20 +1278,50 @@
}
if (!modifying || document.internal->terms_modified()) {
- // FIXME - in the case where there is overlap between the new
- // termlist and the old termlist, it would be better to compare the
- // two lists, and make the minimum set of modifications required.
- // This would lead to smaller changesets for replication, and
- // probably be faster overall.
-
- // First, add entries to remove the postings in the underlying record.
Xapian::Internal::RefCntPtr<const FlintWritableDatabase> ptrtothis(this);
FlintTermList termlist(ptrtothis, did);
+ Xapian::TermIterator term = document.termlist_begin();
+ Xapian::TermIterator term_end = document.termlist_end();
+ flint_doclen_t new_doclen = termlist.get_doclength();
+ string old_tname, new_tname;
+
+ total_length -= new_doclen;
+
+ termlist.next();
+ while (true) {
+ bool identical = false;
+ int cmp;
+ if (termlist.at_end() && term == term_end)
+ break;
+ if (!termlist.at_end() && term != term_end) {
+ old_tname = termlist.get_termname();
+ new_tname = *term;
+ cmp = old_tname.compare(new_tname);
+
+ // Check postlist to see whether they are identical
+ if (cmp == 0) {
+ int new_count = term.positionlist_count();
+ int old_count = termlist.positionlist_count();
+ if (old_count == new_count) {
+ PositionIterator it = term.positionlist_begin();
+ PositionIterator it_end = term.positionlist_end();
+ PositionIterator old = termlist.positionlist_begin();
+ if (equal(it, it_end, old))
+ identical = true;
+ }
+ }
+ } else if (termlist.at_end()) {
+ cmp = 2;
+ new_tname = *term;
+ } else {
+ cmp = -2;
+ old_tname = termlist.get_termname();
+ }
- termlist.next();
- while (!termlist.at_end()) {
- string tname = termlist.get_termname();
+ if (cmp < 0) {
+ const string& tname = old_tname;
termcount wdf = termlist.get_wdf();
+ new_doclen -= wdf;
map<string, pair<termcount_diff, termcount_diff> >::iterator i;
i = freq_deltas.find(tname);
@@ -1318,58 +1348,62 @@
// Modifying a document we added/modified since the last flush.
k->second = make_pair('D', 0u);
}
-
- termlist.next();
- }
-
- total_length -= termlist.get_doclength();
-
- flint_doclen_t new_doclen = 0;
- Xapian::TermIterator term = document.termlist_begin();
- Xapian::TermIterator term_end = document.termlist_end();
- for ( ; term != term_end; ++term) {
- // Calculate the new document length
+ } else if (!identical) {
+ const string& tname = new_tname;
termcount wdf = term.get_wdf();
- new_doclen += wdf;
-
- string tname = *term;
- if (tname.size() > MAX_SAFE_TERM_LENGTH)
- throw Xapian::InvalidArgumentError("Term too long (> "STRINGIZE(MAX_SAFE_TERM_LENGTH)"): " + tname);
- map<string, pair<termcount_diff, termcount_diff> >::iterator i;
- i = freq_deltas.find(tname);
- if (i == freq_deltas.end()) {
- freq_deltas.insert(make_pair(tname, make_pair(1, termcount_diff(wdf))));
- } else {
- ++i->second.first;
- i->second.second += wdf;
- }
+ new_doclen += wdf;
+
+ if (cmp > 0) {
+ if (tname.size() > MAX_SAFE_TERM_LENGTH)
+ throw Xapian::InvalidArgumentError("Term too long (> "STRINGIZE(MAX_SAFE_TERM_LENGTH)"): " + tname);
+ map<string, pair<termcount_diff, termcount_diff> >::iterator i;
+ i = freq_deltas.find(tname);
+ if (i == freq_deltas.end()) {
+ freq_deltas.insert(make_pair(tname, make_pair(1, termcount_diff(wdf))));
+ } else {
+ ++i->second.first;
+ i->second.second += wdf;
+ }
+
+ // Add did to tname's postlist
+ map<string, map<docid, pair<char, termcount> > >::iterator j;
+ j = mod_plists.find(tname);
+ if (j == mod_plists.end()) {
+ map<docid, pair<char, termcount> > m;
+ j = mod_plists.insert(make_pair(tname, m)).first;
+ }
+ map<docid, pair<char, termcount> >::iterator k;
+ k = j->second.find(did);
+ if (k != j->second.end()) {
+ Assert(k->second.first == 'D');
+ k->second.first = 'M';
+ k->second.second = wdf;
+ } else {
+ j->second.insert(make_pair(did, make_pair('A', wdf)));
+ }
+ }
+
+ PositionIterator it = term.positionlist_begin();
+ PositionIterator it_end = term.positionlist_end();
+ if (it != it_end) {
+ position_table.set_positionlist(
+ did, tname, it, it_end);
+ } else {
+ position_table.delete_positionlist(did, tname);
+ }
+ }
+ if (termlist.at_end())
+ ++term;
+ else if (term == term_end)
+ termlist.next();
+ else {
+ if (cmp >= 0)
+ ++term;
+ if (cmp <= 0)
+ termlist.next();
+ }
+ }
- // Add did to tname's postlist
- map<string, map<docid, pair<char, termcount> > >::iterator j;
- j = mod_plists.find(tname);
- if (j == mod_plists.end()) {
- map<docid, pair<char, termcount> > m;
- j = mod_plists.insert(make_pair(tname, m)).first;
- }
- map<docid, pair<char, termcount> >::iterator k;
- k = j->second.find(did);
- if (k != j->second.end()) {
- Assert(k->second.first == 'D');
- k->second.first = 'M';
- k->second.second = wdf;
- } else {
- j->second.insert(make_pair(did, make_pair('A', wdf)));
- }
-
- PositionIterator it = term.positionlist_begin();
- PositionIterator it_end = term.positionlist_end();
- if (it != it_end) {
- position_table.set_positionlist(
- did, tname, it, it_end);
- } else {
- position_table.delete_positionlist(did, tname);
- }
- }
LOGLINE(DB, "Calculated doclen for replacement document " << did << " as " << new_doclen);
// Set the termlist
--
Kan-Ru Chen | http://kanru.info
Q: Why are my replies five sentences or less?
A: http://five.sentenc.es/
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 835 bytes
Desc: not available
URL: <http://notmuchmail.org/pipermail/notmuch/attachments/20091210/3a0f0e53/attachment.pgp>
More information about the notmuch
mailing list