/* Extract doc IDs from an ES shard backup (a Lucene index)

Copyright (c) 2025 Internet Archive

Author: Amin Bandali <[email protected]>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

import java.io.IOException;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;

public class GetIDS {
   public static void main(String[] args) throws IOException {
       Directory indexDirectory = FSDirectory.open(Paths.get("/esdata/bck/16/index"));
       IndexReader reader = DirectoryReader.open(indexDirectory);
       IndexSearcher searcher = new IndexSearcher(reader);

       MatchAllDocsQuery query = new MatchAllDocsQuery();

       int maxHits = reader.numDocs();
       TopDocs topDocs = searcher.search(query, maxHits);

       // System.out.println("Total documents found: " + topDocs.totalHits.value);

       for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
           Document doc = searcher.doc(scoreDoc.doc);
           // System.out.println("Document index: " + scoreDoc.doc);
           for (IndexableField field : doc.getFields()) {
               String name = field.name();
               if (name.equals("_id")) {
                   // String stored = String.valueOf(field.fieldType().stored());
                   String type = "string";
                   String value = field.stringValue();
                   if (value == null) {
                       Number num = field.numericValue();
                       if (num != null) {
                           type = "numeric";
                           value = num.toString();
                       } else {
                           BytesRef bin = field.binaryValue();
                           if (bin != null) {
                               type = "binary";
                               // based on BytesRef.utf8ToString()
                               // for some reason, the first byte is always -1 ???
                               final char[] arr = new char[bin.length];
                               int len;
                               int offset = bin.offset + 1;
                               int length = bin.length - 1;
                               try {
                                   len = UnicodeUtil.UTF8toUTF16(bin.bytes, offset, length, arr);
                               } catch(ArrayIndexOutOfBoundsException e) {
                                   String err = "error aioob: bytes: %s (%s), offset: %d, length: %d";
                                   System.err.println(String.format(err, bin.bytes, Arrays.toString(bin.bytes), offset, length));
                                   break;
                               }
                               value = new String(arr, 0, len);
                           } else {
                               type = "unknown";
                           }
                       }
                   }
                   // System.out.println(value + " (type: " + type + ")");
                   System.out.println(value);
               }
           }
       }

       reader.close();
       indexDirectory.close();
   }
}