/* Extract doc IDs from an ES shard backup (a Lucene index)

Copyright (c) 2025 Internet Archive

Author: Amin Bandali <[email protected]>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import java.io.IOException;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Base64;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;

/**
 * Prints the {@code _id} stored field of every live document in a Lucene
 * index, one value per line, on standard output.
 *
 * <p>Usage: {@code java GetIDS [indexPath]}. When no path is given the
 * original hard-coded location {@value #DEFAULT_INDEX_PATH} is used.
 *
 * <p>Binary {@code _id} values are decoded according to the three layouts
 * that Elasticsearch's {@code org.elasticsearch.index.mapper.Uid} writes
 * (UTF-8, packed numeric, base64), selected by the first byte.
 * NOTE(review): the layouts are reproduced from the Elasticsearch sources
 * from memory; confirm against the ES version that produced the backup.
 */
public class GetIDS {
    /** Index location used when no path is passed on the command line. */
    private static final String DEFAULT_INDEX_PATH = "/esdata/bck/16/index";

    /** First byte of a UTF-8 encoded id (shows up as -1 when read as a signed byte). */
    private static final int ID_UTF8 = 0xff;
    /** First byte of an id made only of decimal digits, packed two digits per byte. */
    private static final int ID_NUMERIC = 0xfe;
    /** Escape byte put in front of base64 payloads whose own first byte is 0xfd or above. */
    private static final int ID_BASE64_ESCAPE = 0xfd;

    /**
     * Opens the index, runs a match-all query and prints each document's id.
     *
     * @param args optional; {@code args[0]} is the index directory
     * @throws IOException if the index cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String indexPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_PATH;

        // try-with-resources closes the reader first, then the directory,
        // even when searching or decoding throws.
        try (Directory indexDirectory = FSDirectory.open(Paths.get(indexPath));
             IndexReader reader = DirectoryReader.open(indexDirectory)) {
            int maxHits = reader.numDocs();
            if (maxHits == 0) {
                // Lucene rejects a request for zero hits, so an empty index
                // is handled here: there is simply nothing to print.
                return;
            }

            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), maxHits);

            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                printIds(searcher.doc(scoreDoc.doc));
            }
        }
    }

    /**
     * Prints the value of every {@code _id} field of {@code doc}. If one of
     * them cannot be decoded, the error goes to standard error and the rest
     * of the document's fields are skipped.
     */
    private static void printIds(Document doc) {
        for (IndexableField field : doc.getFields()) {
            if (!"_id".equals(field.name())) {
                continue;
            }
            String value;
            try {
                value = readId(field);
            } catch (IllegalArgumentException e) {
                System.err.println(e.getMessage());
                break;
            }
            // A field with no string, numeric or binary value yields null
            // here and is printed as the text "null".
            System.out.println(value);
        }
    }

    /**
     * Returns the field's value as text, trying the string, numeric and
     * binary representations in that order; {@code null} if it has none.
     *
     * @throws IllegalArgumentException if a binary value cannot be decoded
     */
    private static String readId(IndexableField field) {
        String text = field.stringValue();
        if (text != null) {
            return text;
        }
        Number num = field.numericValue();
        if (num != null) {
            return num.toString();
        }
        BytesRef bin = field.binaryValue();
        if (bin != null) {
            return decodeBinaryId(bin);
        }
        return null;
    }

    /**
     * Decodes a binary id by dispatching on its first (marker) byte.
     *
     * @throws IllegalArgumentException if the value is empty or malformed
     */
    private static String decodeBinaryId(BytesRef bin) {
        if (bin.length == 0) {
            throw new IllegalArgumentException("error: empty binary _id");
        }
        int marker = bin.bytes[bin.offset] & 0xff;
        switch (marker) {
            case ID_UTF8:
                return decodeUtf8Id(bin);
            case ID_NUMERIC:
                return decodeNumericId(bin);
            default:
                return decodeBase64Id(bin);
        }
    }

    /** Decodes the bytes after the marker as UTF-8 (based on BytesRef.utf8ToString()). */
    private static String decodeUtf8Id(BytesRef bin) {
        int offset = bin.offset + 1;
        int length = bin.length - 1;
        // Sized from the full field length, i.e. one slot more than the payload.
        final char[] arr = new char[bin.length];
        try {
            int len = UnicodeUtil.UTF8toUTF16(bin.bytes, offset, length, arr);
            return new String(arr, 0, len);
        } catch (ArrayIndexOutOfBoundsException e) {
            String err = "error aioob: bytes: %s (%s), offset: %d, length: %d";
            throw new IllegalArgumentException(
                String.format(err, bin.bytes, Arrays.toString(bin.bytes), offset, length), e);
        }
    }

    /**
     * Unpacks a numeric id: each byte after the marker holds two decimal
     * digits (high nibble first); a low nibble of 0xf in the last byte means
     * the id has an odd number of digits.
     */
    private static String decodeNumericId(BytesRef bin) {
        StringBuilder digits = new StringBuilder((bin.length - 1) * 2);
        int end = bin.offset + bin.length;
        for (int i = bin.offset + 1; i < end; i++) {
            int packed = bin.bytes[i] & 0xff;
            digits.append((char) ('0' + (packed >>> 4)));
            int low = packed & 0x0f;
            if (i == end - 1 && low == 0x0f) {
                break;
            }
            digits.append((char) ('0' + low));
        }
        return digits.toString();
    }

    /**
     * Re-encodes the raw bytes as URL-safe base64 without padding, dropping
     * a leading escape byte when present.
     */
    private static String decodeBase64Id(BytesRef bin) {
        int from = bin.offset;
        if ((bin.bytes[from] & 0xff) == ID_BASE64_ESCAPE) {
            from++;
        }
        byte[] raw = Arrays.copyOfRange(bin.bytes, from, bin.offset + bin.length);
        return Base64.getUrlEncoder().withoutPadding().encodeToString(raw);
    }
}