-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Use the doc-values skip index to skip per-doc value lookups in LongRangeFacetCutter #16268
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1bf8688
2a8d01f
dc38fbc
88fe293
725a74f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,11 @@ | |
| import org.apache.lucene.facet.MultiLongValues; | ||
| import org.apache.lucene.facet.MultiLongValuesSource; | ||
| import org.apache.lucene.facet.range.LongRange; | ||
| import org.apache.lucene.index.DocValues; | ||
| import org.apache.lucene.index.DocValuesSkipper; | ||
| import org.apache.lucene.index.LeafReaderContext; | ||
| import org.apache.lucene.index.NumericDocValues; | ||
| import org.apache.lucene.index.SortedNumericDocValues; | ||
| import org.apache.lucene.sandbox.facet.cutters.FacetCutter; | ||
| import org.apache.lucene.sandbox.facet.cutters.LeafFacetCutter; | ||
| import org.apache.lucene.search.LongValues; | ||
|
|
@@ -42,6 +47,10 @@ public abstract class LongRangeFacetCutter implements FacetCutter { | |
|
|
||
| // TODO: refactor - weird that we have both multi and single here. | ||
| final LongValuesSource singleValues; | ||
|
|
||
| // Field name whose skip index is used on the single-valued path, or null when faceting a source. | ||
| final String skipField; | ||
|
|
||
| final LongRangeAndPos[] sortedRanges; | ||
|
|
||
| final int requestedRangeCount; | ||
|
|
@@ -62,32 +71,51 @@ static LongRangeFacetCutter createSingleOrMultiValued( | |
| MultiLongValuesSource longValuesSource, | ||
| LongValuesSource singleLongValuesSource, | ||
| LongRange[] longRanges) { | ||
| return createSingleOrMultiValued(longValuesSource, singleLongValuesSource, longRanges, null); | ||
| } | ||
|
|
||
| /** Same as above, but uses the {@code skipField} skip index on the single-valued path. */ | ||
| static LongRangeFacetCutter createSingleOrMultiValued( | ||
| MultiLongValuesSource longValuesSource, | ||
| LongValuesSource singleLongValuesSource, | ||
| LongRange[] longRanges, | ||
| String skipField) { | ||
| if (areOverlappingRanges(longRanges)) { | ||
| return new OverlappingLongRangeFacetCutter( | ||
| longValuesSource, singleLongValuesSource, longRanges); | ||
| longValuesSource, singleLongValuesSource, longRanges, skipField); | ||
| } | ||
| return new NonOverlappingLongRangeFacetCutter( | ||
| longValuesSource, singleLongValuesSource, longRanges); | ||
| longValuesSource, singleLongValuesSource, longRanges, skipField); | ||
| } | ||
|
|
||
| public static LongRangeFacetCutter create( | ||
| MultiLongValuesSource longValuesSource, LongRange[] longRanges) { | ||
| return createSingleOrMultiValued(longValuesSource, null, longRanges); | ||
| return createSingleOrMultiValued(longValuesSource, null, longRanges, null); | ||
| } | ||
|
|
||
| /** Create {@link FacetCutter} for a long field by name, using its skip index when present. */ | ||
| public static LongRangeFacetCutter create(String field, LongRange[] longRanges) { | ||
| // Leave the single-valued source null. The skip path reads the field directly, and a | ||
| // multi-valued segment must fall back to the multi-valued leaf cutter. | ||
| return createSingleOrMultiValued( | ||
| MultiLongValuesSource.fromLongField(field), null, longRanges, field); | ||
| } | ||
|
|
||
| // caller handles conversion of Doubles and DoubleRange to Long and LongRange | ||
| // ranges need not be sorted | ||
| LongRangeFacetCutter( | ||
| MultiLongValuesSource longValuesSource, | ||
| LongValuesSource singleLongValuesSource, | ||
| LongRange[] longRanges) { | ||
| LongRange[] longRanges, | ||
| String skipField) { | ||
| super(); | ||
| valuesSource = longValuesSource; | ||
| if (singleLongValuesSource != null) { | ||
| singleValues = singleLongValuesSource; | ||
| } else { | ||
| singleValues = MultiLongValuesSource.unwrapSingleton(valuesSource); | ||
| } | ||
| this.skipField = skipField; | ||
|
|
||
| sortedRanges = new LongRangeAndPos[longRanges.length]; | ||
| requestedRangeCount = longRanges.length; | ||
|
|
@@ -124,6 +152,39 @@ public static LongRangeFacetCutter create( | |
| */ | ||
| abstract List<InclusiveRange> buildElementaryIntervals(); | ||
|
|
||
| /** | ||
| * Returns the {@link DocValuesSkipper} for {@link #skipField} in this segment. Null when: no skip | ||
| * field is configured, the field has no skip index, or some doc in this segment has more than one | ||
| * value. | ||
| */ | ||
| final DocValuesSkipper maybeSkipper(LeafReaderContext context) throws IOException { | ||
| if (skipField == null) { | ||
| return null; | ||
| } | ||
| SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(context.reader(), skipField); | ||
| if (DocValues.unwrapSingleton(sortedNumeric) == null) { | ||
| return null; | ||
| } | ||
| return context.reader().getDocValuesSkipper(skipField); | ||
| } | ||
|
|
||
| /** Single-valued {@link LongValues} for {@link #skipField} in this segment. */ | ||
| final LongValues skipFieldValues(LeafReaderContext context) throws IOException { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this method a part of the single value unwrapping logic that we want to remove for now?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nope, this is part of the core skipper path.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh I see now. The initial logic was: Then in the second revision we tried: And now we are back to the first approach? The reason I got confused is that I thought we also used the skipper optimization for multi-valued fields, but I see now that the description explicitly calls out single-valued fields only. I wonder why that is the case? I thought Lucene supported skippers for multi-valued fields as well.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to clarify why I'm asking about multi valued fields: the current version still uses the unwrap-singleton optimization, but only together with the skipper. So there are basically two optimizations here, and I thought that was what you wanted to avoid? If that’s the case, and if multi-valued fields support skippers, I’d suggest scoping this PR to the skipper optimization only, for both single- and multi-valued fields, and moving the single-valued unwrapping optimization to a follow-up PR. WDYT?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I'm getting confused in the naming here. I have removed the change that was not related to skipper, The unwrapSingleton call is used to detect the segment is single-valued and to get the single-valued NumericDocValues. The other change, now removed, caused single-valued segments with no skipper to always go to the single-valued cutter when it should depend on the existing logic. Thanks for this and other comments, I'll look into adding multi-valued skipper when I get the time and do it in this PR.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we rename |
||
| NumericDocValues values = | ||
| DocValues.unwrapSingleton(DocValues.getSortedNumeric(context.reader(), skipField)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we deduplicate the calls to |
||
| return new LongValues() { | ||
| @Override | ||
| public long longValue() throws IOException { | ||
| return values.longValue(); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean advanceExact(int doc) throws IOException { | ||
| return values.advanceExact(doc); | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| private static boolean areOverlappingRanges(LongRange[] ranges) { | ||
| if (ranges.length == 0) { | ||
| return false; | ||
|
|
@@ -252,29 +313,98 @@ abstract static class LongRangeSingleValuedLeafFacetCutter implements LeafFacetC | |
|
|
||
| IntervalTracker requestedIntervalTracker; | ||
|
|
||
| private final DocValuesSkipper skipper; | ||
|
|
||
| // advanceSkipper's decisions for the current block; the fields below hold while doc <= | ||
| // upToInclusive, after which it runs again for the next block. | ||
| private int upToInclusive = -1; | ||
| // Whether every value in the block maps to the single interval upToIntervalOrd. | ||
| private boolean upToSameInterval; | ||
| // Whether every doc in the block has a value. | ||
| private boolean upToDense; | ||
| private int upToIntervalOrd; | ||
|
|
||
| // Interval of the previous doc with a value, for replaying the tracker on a repeat. | ||
| private int previousIntervalOrd = -1; | ||
|
|
||
| LongRangeSingleValuedLeafFacetCutter(LongValues longValues, long[] boundaries, int[] pos) { | ||
| this(longValues, boundaries, pos, null); | ||
| } | ||
|
|
||
| LongRangeSingleValuedLeafFacetCutter( | ||
| LongValues longValues, long[] boundaries, int[] pos, DocValuesSkipper skipper) { | ||
| this.longValues = longValues; | ||
| this.boundaries = boundaries; | ||
| this.pos = pos; | ||
| this.skipper = skipper; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean advanceExact(int doc) throws IOException { | ||
| if (longValues.advanceExact(doc) == false) { | ||
| return false; | ||
| if (skipper != null && doc > upToInclusive) { | ||
| advanceSkipper(doc); | ||
| } | ||
| if (requestedIntervalTracker != null) { | ||
| requestedIntervalTracker.clear(); | ||
|
|
||
| int intervalOrd; | ||
| if (upToSameInterval) { | ||
| // Reuse the cached ordinal, skipping the binary search. A dense block also skips the value | ||
| // lookup, a sparse one still needs advanceExact to know whether this doc has a value. | ||
| if (upToDense == false && longValues.advanceExact(doc) == false) { | ||
| return false; | ||
| } | ||
| intervalOrd = upToIntervalOrd; | ||
| } else if (longValues.advanceExact(doc)) { | ||
| intervalOrd = processValue(longValues.longValue()); | ||
| } else { | ||
| return false; | ||
| } | ||
| elementaryIntervalOrd = processValue(longValues.longValue()); | ||
| maybeRollUp(requestedIntervalTracker); | ||
|
|
||
| elementaryIntervalOrd = intervalOrd; | ||
| if (requestedIntervalTracker != null) { | ||
| requestedIntervalTracker.freeze(); | ||
| if (skipper != null && intervalOrd == previousIntervalOrd) { | ||
| // Same interval as the previous doc, so replay its frozen rollup instead of rebuilding. | ||
| requestedIntervalTracker.rewind(); | ||
| } else { | ||
| requestedIntervalTracker.clear(); | ||
| maybeRollUp(requestedIntervalTracker); | ||
| requestedIntervalTracker.freeze(); | ||
| previousIntervalOrd = intervalOrd; | ||
| } | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| private void advanceSkipper(int doc) throws IOException { | ||
| if (doc > skipper.maxDocID(0)) { | ||
| skipper.advance(doc); | ||
| } | ||
| upToSameInterval = false; | ||
|
|
||
| if (skipper.minDocID(0) > doc) { | ||
| // Corner case which happens if doc doesn't have a value and is between two intervals of the | ||
| // skip index. Fall back to per-doc lookups until the next block. | ||
| upToInclusive = skipper.minDocID(0) - 1; | ||
| return; | ||
| } | ||
|
|
||
| upToInclusive = skipper.maxDocID(0); | ||
| // Climb to the highest level that still maps to a single interval. | ||
| for (int level = 0; level < skipper.numLevels(); ++level) { | ||
| // Long fields store raw values, skipper's min/max maps straight into the boundary space. | ||
| int minInterval = processValue(skipper.minValue(level)); | ||
| int maxInterval = processValue(skipper.maxValue(level)); | ||
| if (minInterval != maxInterval) { | ||
| break; | ||
| } | ||
| upToInclusive = skipper.maxDocID(level); | ||
| upToSameInterval = true; | ||
| upToIntervalOrd = minInterval; | ||
| int totalDocsAtLevel = skipper.maxDocID(level) - skipper.minDocID(level) + 1; | ||
| upToDense = skipper.docCount(level) == totalDocsAtLevel; | ||
| } | ||
| } | ||
|
|
||
| // Returns the value of the interval v belongs or lastIntervalSeen | ||
| // if no processing is done, it returns the lastIntervalSeen | ||
| private int processValue(long v) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
WDYT about renaming this to
fieldName?skipis a bit confusing, since the field does not necessarily have a skipper even if this value is set.