Lab 2 Code

3182bb11 · mohamad.alturky · c739ad57 · 3182bb11 · 3182bb11 · 3182bb11
Commit 3182bb11 authored Apr 24, 2024 by mohamad.alturky
10 changed files
--- a/README.md
+++ b/README.md
@@ -4,30 +4,11 @@
 > <br/>
 > Libraries : Lucene
 > <br/>
-
-## Features
-
-> We can Index and search this files formats
+> Lab : 2
 > <br/>
->> csv
->> <br/>
->> txt
->> <br/>
->> pdf
-
-## Project Packages Structure
-> <div style="color:#9215a0">settings</div> <div style="color:#3b9636">contains program setting and configuration.</div>
-
-> <div style="color:#9215a0">filters</div> <div style="color:#3b9636">contains the file filters to know the target file extension.</div>
-
-> <div style="color:#9215a0">indexers</div> <div style="color:#3b9636">contains the indexers for csv, pdf and txt file extension.</div>
-
-> <div style="color:#9215a0">searchers</div> <div style="color:#3b9636">contains the one class to search in the indexed files.</div>
-
-> <div style="color:#9215a0">representers</div> <div style="color:#3b9636">the return type of the searcher is a list of documents each one encoded with some structure so we have a staregy for each type of encoding to decode</div>
-
-> <div style="color:#9215a0">representer resolver</div> <div style="color:#3b9636">gets a document and return the appropriate representation for this document</div>

-> <div style="color:#9215a0">benchmark class</div> <div style="color:#3b9636">caluclates the time consumed to execute a function</div>
+## The New Things Added To The Code
+> <div style="color:#9215a0">analyzers package</div> <div style="color:#3b9636">contains the LuceneAnalyzer class, which has the analyze function, and the CustomAnalyzerWrapperBuilder class, which returns a PerFieldAnalyzerWrapper built from a map of field names to analyzers.</div>

-> <div style="color:#9215a0">search engine</div> <div style="color:#3b9636">encapsulates the indexing and searching process. it contains a list of the indexers each one knows how to index a file and have the searcher to search for a query the output of it will be forwarded to the representers resolver to get the appropriate representation.</div>
+> <div style="color:#9215a0">main function</div> <div style="color:#3b9636">contains usage of some analyzers and the creation of the custom analyzer.</div>
+> <div style="color:#9215a0">edited indexers</div> <div style="color:#3b9636">the indexers now receive an instance of PerFieldAnalyzerWrapper through their constructors.</div>
--- a/pom.xml
+++ b/pom.xml
@@ -42,5 +42,10 @@
            <artifactId>lucene</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>8.11.3</version>
+        </dependency>
    </dependencies>
 </project>
\ No newline at end of file
--- a/src/main/java/com/search/lucene/Lucene.java
+++ b/src/main/java/com/search/lucene/Lucene.java
@@ -2,7 +2,9 @@ package com.search.lucene;

 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.List;

+import com.search.lucene.analyzers.LuceneAnalyzer;
 import com.search.lucene.documents.representers.resolver.DocumentToStringRepresenterResolver;
 import com.search.lucene.documents.representers.resolver.IDocumentRepresenterResolver;
 import com.search.lucene.engine.abstractions.ISearchEngine;
@@ -12,6 +14,14 @@ import com.search.lucene.indexers.implementations.TextFileIndexer;
 import com.search.lucene.performance.Benchmarker;
 import com.search.lucene.searchers.implementations.IndexedDocumentSearcher;
 import com.search.lucene.settings.Constants;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.*;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.en.PorterStemFilterFactory;
+import org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.search.ScoreDoc;
@@ -20,63 +30,58 @@ import org.apache.lucene.search.TopDocs;
 public class Lucene {

    public static void main(String[] args) throws Exception {
-        IEngineBuilder builder = new LuceneEngineBuilder();
-        ISearchEngine searchEngine = builder.build();
+        testAnalyzers();
+    }
+
+    private static void testAnalyzers() {
+        try {
+
+            String text = "some text from a good man";
+
+            StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
+            List<String> tokens = LuceneAnalyzer.analyze(standardAnalyzer, text);
+            System.out.println(standardAnalyzer.getClass().getName() + " :");
+            tokens.forEach(System.out::println);
+
+
+            SimpleAnalyzer simpleAnalyzer = new SimpleAnalyzer();
+            List<String> simpleTokens = LuceneAnalyzer.analyze(simpleAnalyzer, text);
+            System.out.println(simpleAnalyzer.getClass().getName() + " :");
+            simpleTokens.forEach(System.out::println);
+
+            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer();
+            List<String> whiteSpaceTokens = LuceneAnalyzer.analyze(whitespaceAnalyzer, text);
+            System.out.println(whitespaceAnalyzer.getClass().getName() + " :");
+            whiteSpaceTokens.forEach(System.out::println);
+
+            KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
+            List<String> keywordTokens = LuceneAnalyzer.analyze(keywordAnalyzer, text);
+
+            System.out.println(keywordAnalyzer.getClass().getName() + " :");
+            keywordTokens.forEach(System.out::println);
+
+            EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer();
+            List<String> englishTokens = LuceneAnalyzer.analyze(englishAnalyzer, text);
+            System.out.println(englishAnalyzer.getClass().getName() + " :");
+            englishTokens.forEach(System.out::println);
+

-        Benchmarker.benchmark(() -> {
-            searchEngine.createIndexesForDirectory("data");
-        });
+            Analyzer customAnalyzer = CustomAnalyzer.builder()
+                    .withTokenizer(StandardTokenizerFactory.NAME)
+                    .addTokenFilter(LowerCaseFilterFactory.NAME)
+                    .addTokenFilter(StopFilterFactory.NAME)
+                    .addTokenFilter(PorterStemFilterFactory.NAME)
+                    .addTokenFilter(CapitalizationFilterFactory.class)
+                    .build();
+            List<String> customTokens = LuceneAnalyzer.analyze(customAnalyzer, text);

+            System.out.println(customAnalyzer.getClass().getName() + " :");
+            customTokens.forEach(System.out::println);

-        ArrayList<Document> results = searchEngine.search("Julien Leclercq");
-        IDocumentRepresenterResolver<String> resolver = new DocumentToStringRepresenterResolver();
-        System.out.println();
-        for (var result : results) {
-            System.out.println(resolver.resolveRepresentation(result));
+        } catch (IOException e) {
+            System.out.println(e.getMessage());
        }
    }


-//    private static final String indexDir = "index";
-//    private static final String dataDir = "data";
-//    private static TextFileIndexer indexer;
-//    private static IndexedDocumentSearcher textFileSearcher;
-//
-//    public static void main(String[] args) {
-//        try {
-//            createIndex();
-//            search("Kota");
-//            search("s");
-//        } catch (IOException e) {
-//            e.printStackTrace();
-//        } catch (ParseException e) {
-//            e.printStackTrace();
-//        }
-//    }
-//
-//    private static void createIndex() throws IOException {
-//        indexer = new TextFileIndexer(indexDir);
-//        int numIndexed;
-//        long startTime = System.currentTimeMillis();
-//        numIndexed = indexer.createIndex(dataDir);
-//        long endTime = System.currentTimeMillis();
-//        indexer.close();
-//        System.out.println(numIndexed+" File indexed, time taken: "
-//                +(endTime-startTime)+" ms");
-//    }
-//
-//    private static void search(String searchQuery) throws IOException, ParseException {
-//        textFileSearcher = new IndexedDocumentSearcher(indexDir,Constants.CONTENT);
-//        long startTime = System.currentTimeMillis();
-//        TopDocs hits = textFileSearcher.search(searchQuery);
-//        long endTime = System.currentTimeMillis();
-//
-//        System.out.println(hits.totalHits +
-//                " documents found. Time :" + (endTime - startTime));
-//        for(ScoreDoc scoreDoc : hits.scoreDocs) {
-//            Document doc = textFileSearcher.getDocument(scoreDoc);
-//            System.out.println("File: "
-//                    + doc.get(Constants.FILE_PATH));
-//        }
-//    }
 }
--- a/src/main/java/com/search/lucene/analyzers/CustomAnalyzerWrapperBuilder.java
+++ b/src/main/java/com/search/lucene/analyzers/CustomAnalyzerWrapperBuilder.java
+package com.search.lucene.analyzers;
+
+import com.search.lucene.settings.Constants;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.en.EnglishAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Assembles the per-field analyzer configuration that is handed to the indexers.
+ */
+public class CustomAnalyzerWrapperBuilder {
+    /**
+     * Creates a {@link PerFieldAnalyzerWrapper} with a fixed field-to-analyzer mapping.
+     *
+     * <p>The {@code ID}, {@code FILE_PATH} and {@code FILE_NAME} fields are each mapped to
+     * their own {@link KeywordAnalyzer}; the {@code CONTENT} and {@code DESCRIPTION} fields
+     * are each mapped to their own {@link EnglishAnalyzer}. A {@link StandardAnalyzer} is
+     * passed as the wrapper's default analyzer.
+     *
+     * @return a new wrapper; every call builds fresh analyzer instances
+     */
+    public PerFieldAnalyzerWrapper build() {
+        Map<String, Analyzer> analyzersByField = new HashMap<>();
+
+        // Fields registered with a KeywordAnalyzer, one instance per field.
+        for (String field : new String[] {Constants.ID, Constants.FILE_PATH, Constants.FILE_NAME}) {
+            analyzersByField.put(field, new KeywordAnalyzer());
+        }
+
+        // Fields registered with an EnglishAnalyzer, one instance per field.
+        for (String field : new String[] {Constants.CONTENT, Constants.DESCRIPTION}) {
+            analyzersByField.put(field, new EnglishAnalyzer());
+        }
+
+        Analyzer defaultAnalyzer = new StandardAnalyzer();
+        return new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzersByField);
+    }
+}
--- a/src/main/java/com/search/lucene/analyzers/LuceneAnalyzer.java
+++ b/src/main/java/com/search/lucene/analyzers/LuceneAnalyzer.java
+package com.search.lucene.analyzers;
+
+import com.search.lucene.settings.Constants;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Helper for running an {@link Analyzer} over a piece of text and collecting the terms
+ * it produces.
+ */
+public class LuceneAnalyzer {
+    /**
+     * Analyzes {@code text} with {@code analyzer} and returns the emitted terms in order.
+     *
+     * <p>The token stream is requested under the {@link Constants#DESCRIPTION} field name.
+     * The stream is always ended and closed before this method returns, so the same
+     * analyzer instance can be passed to this method again.
+     *
+     * @param analyzer the analyzer used to tokenize the text
+     * @param text     the text to analyze
+     * @return the string form of every token produced by the analyzer, in stream order
+     * @throws IOException if the token stream fails while being reset, advanced, ended or closed
+     */
+    public static List<String> analyze(Analyzer analyzer, String text)
+            throws IOException {
+        List<String> tokens = new ArrayList<>();
+        // try-with-resources guarantees close() even when tokenization throws; the
+        // TokenStream consume workflow requires close() before the stream is requested again.
+        try (TokenStream tokenStream = analyzer.tokenStream(Constants.DESCRIPTION, text)) {
+            CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                tokens.add(attr.toString());
+            }
+            // end() is the required step after the last incrementToken() and before close().
+            tokenStream.end();
+        }
+        return tokens;
+    }
+}
--- a/src/main/java/com/search/lucene/engine/builder/implementations/LuceneEngineBuilder.java
+++ b/src/main/java/com/search/lucene/engine/builder/implementations/LuceneEngineBuilder.java
 package com.search.lucene.engine.builder.implementations;

+import com.search.lucene.analyzers.CustomAnalyzerWrapperBuilder;
 import com.search.lucene.engine.abstractions.ISearchEngine;
 import com.search.lucene.engine.builder.abstractions.IEngineBuilder;
 import com.search.lucene.engine.implementations.LuceneEngine;
@@ -21,9 +22,11 @@ public class LuceneEngineBuilder implements IEngineBuilder {
    public ISearchEngine build() throws IOException {

        ArrayList<IFileIndexer> indexers = new ArrayList<>();
-        indexers.add(new CSVFileIndexer(IndexPaths.CSV_INDEX_FOLDER_PATH));
-        indexers.add(new PDFFileIndexer(IndexPaths.PDF_INDEX_FOLDER_PATH));
-        indexers.add(new TextFileIndexer(IndexPaths.TEXT_INDEX_FOLDER_PATH));
+        CustomAnalyzerWrapperBuilder builder = new CustomAnalyzerWrapperBuilder();
+        var wrapper = builder.build();
+        indexers.add(new CSVFileIndexer(IndexPaths.CSV_INDEX_FOLDER_PATH,wrapper));
+        indexers.add(new PDFFileIndexer(IndexPaths.PDF_INDEX_FOLDER_PATH,wrapper));
+        indexers.add(new TextFileIndexer(IndexPaths.TEXT_INDEX_FOLDER_PATH,wrapper));

        ArrayList<ISearcher> searchers = new ArrayList<>();
        searchers.add(new IndexedDocumentSearcher(IndexPaths.CSV_INDEX_FOLDER_PATH, Constants.VALUE));

--- a/src/main/java/com/search/lucene/indexers/implementations/CSVFileIndexer.java
+++ b/src/main/java/com/search/lucene/indexers/implementations/CSVFileIndexer.java
@@ -16,6 +16,7 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;

 import java.io.File;
 import java.io.FileReader;
@@ -27,14 +28,14 @@ public class CSVFileIndexer implements IFileIndexer {
    private final IndexWriter writer;
    private final IFileFilter filter;

-    public CSVFileIndexer(String indexDirectoryPath) throws IOException {
+    public CSVFileIndexer(String indexDirectoryPath, PerFieldAnalyzerWrapper wrapper) throws IOException {
        this.filter = new CSVFileFilter();
        //this directory will contain the indexes
        Directory indexDirectory =
                FSDirectory.open(Paths.get(indexDirectoryPath));

        //create the indexer
-        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
+        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(wrapper));
    }

    @Override

--- a/src/main/java/com/search/lucene/indexers/implementations/PDFFileIndexer.java
+++ b/src/main/java/com/search/lucene/indexers/implementations/PDFFileIndexer.java
@@ -5,6 +5,7 @@ import com.search.lucene.file.filters.implementations.PDFFileFilter;
 import com.search.lucene.indexers.abstractions.IFileIndexer;
 import com.search.lucene.settings.Constants;
 import com.search.lucene.settings.RepresentationSchema;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.TextField;
@@ -26,13 +27,13 @@ public class PDFFileIndexer implements IFileIndexer {
    private final IndexWriter writer;
    private final IFileFilter filter;

-    public PDFFileIndexer(String indexDirectoryPath) throws IOException {
+    public PDFFileIndexer(String indexDirectoryPath, PerFieldAnalyzerWrapper wrapper) throws IOException {
        //this directory will contain the indexes
        Directory indexDirectory =
                FSDirectory.open(Paths.get(indexDirectoryPath));
        this.filter = new PDFFileFilter();
        //create the indexer
-        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
+        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(wrapper));
    }
    @Override
    public void close() throws CorruptIndexException, IOException {

--- a/src/main/java/com/search/lucene/indexers/implementations/TextFileIndexer.java
+++ b/src/main/java/com/search/lucene/indexers/implementations/TextFileIndexer.java
@@ -10,6 +10,7 @@ import com.search.lucene.file.filters.implementations.TextFileFilter;
 import com.search.lucene.indexers.abstractions.IFileIndexer;
 import com.search.lucene.settings.Constants;
 import com.search.lucene.settings.RepresentationSchema;
+import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -24,14 +25,14 @@ public class TextFileIndexer implements IFileIndexer {
    private final IndexWriter writer;
    private final IFileFilter filter;

-    public TextFileIndexer(String indexDirectoryPath) throws IOException {
+    public TextFileIndexer(String indexDirectoryPath, PerFieldAnalyzerWrapper wrapper) throws IOException {
        this.filter = new TextFileFilter();
        //this directory will contain the indexes
        Directory indexDirectory =
                FSDirectory.open(Paths.get(indexDirectoryPath));

        //create the indexer
-        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
+        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(wrapper));
    }
    @Override
    public void close() throws CorruptIndexException, IOException {

--- a/src/main/java/com/search/lucene/settings/Constants.java
+++ b/src/main/java/com/search/lucene/settings/Constants.java
@@ -8,7 +8,9 @@ public class Constants {
    public static final String FILE_PATH = "filepath";
    public static final String COLUMN = "column";
    public static final String VALUE = "value";
+    public static final String ID = "id";
    public static final String ROW = "row";
    public static final int MAX_SEARCH = 10;
+    public static final String DESCRIPTION = "description";
 }