Commit 3182bb11 authored by mohamad.alturky's avatar mohamad.alturky

Lab 2 Code

parent c739ad57
......@@ -4,30 +4,11 @@
> <br/>
> Libraries : Lucene
> <br/>
## Features
> We can index and search these file formats
> Lab : 2
> <br/>
>> csv
>> <br/>
>> txt
>> <br/>
>> pdf
## Project Packages Structure
> <div style="color:#9215a0">settings</div> <div style="color:#3b9636">contains program setting and configuration.</div>
> <div style="color:#9215a0">filters</div> <div style="color:#3b9636">contains the file filters to know the target file extension.</div>
> <div style="color:#9215a0">indexers</div> <div style="color:#3b9636">contains the indexers for csv, pdf and txt file extension.</div>
> <div style="color:#9215a0">searchers</div> <div style="color:#3b9636">contains the one class to search in the indexed files.</div>
> <div style="color:#9215a0">representers</div> <div style="color:#3b9636">the return type of the searcher is a list of documents, each one encoded with some structure, so we have a strategy for each type of encoding to decode it</div>
> <div style="color:#9215a0">representer resolver</div> <div style="color:#3b9636">gets a document and return the appropriate representation for this document</div>
> <div style="color:#9215a0">benchmark class</div> <div style="color:#3b9636">calculates the time consumed to execute a function</div>
## The New Things Added To The Code
> <div style="color:#9215a0">analyzers package</div> <div style="color:#3b9636">contains the LuceneAnalyzer class, which has the analyze function, and the CustomAnalyzerWrapperBuilder class, which returns a PerFieldAnalyzerWrapper built from a map of field names to analyzers.</div>
> <div style="color:#9215a0">search engine</div> <div style="color:#3b9636">encapsulates the indexing and searching process. It contains a list of indexers, each of which knows how to index a file, and has the searcher to search for a query; the search output is forwarded to the representer resolver to get the appropriate representation.</div>
> <div style="color:#9215a0">main function</div> <div style="color:#3b9636">contains usage of some analyzers and the creation of the custom analyzer.</div>
> <div style="color:#9215a0">edit the indexers</div> <div style="color:#3b9636">to get an instance of PerFieldAnalyzerWrapper from the constructor.</div>
......@@ -42,5 +42,10 @@
<artifactId>lucene</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>8.11.3</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
......@@ -2,7 +2,9 @@ package com.search.lucene;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.search.lucene.analyzers.LuceneAnalyzer;
import com.search.lucene.documents.representers.resolver.DocumentToStringRepresenterResolver;
import com.search.lucene.documents.representers.resolver.IDocumentRepresenterResolver;
import com.search.lucene.engine.abstractions.ISearchEngine;
......@@ -12,6 +14,14 @@ import com.search.lucene.indexers.implementations.TextFileIndexer;
import com.search.lucene.performance.Benchmarker;
import com.search.lucene.searchers.implementations.IndexedDocumentSearcher;
import com.search.lucene.settings.Constants;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.*;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.ScoreDoc;
......@@ -20,63 +30,58 @@ import org.apache.lucene.search.TopDocs;
public class Lucene {
/**
 * Program entry point: builds a search engine through {@code LuceneEngineBuilder} and then
 * runs the analyzer demonstration in {@code testAnalyzers()}.
 *
 * @param args command-line arguments; not read by the statements shown here
 * @throws Exception propagated from {@code builder.build()}
 */
public static void main(String[] args) throws Exception {
IEngineBuilder builder = new LuceneEngineBuilder();
// NOTE(review): searchEngine is built but not used by any other statement inside this method.
ISearchEngine searchEngine = builder.build();
testAnalyzers();
}
/**
 * Demonstrates how different analyzers tokenize the same sample sentence.
 *
 * <p>For each analyzer the sentence is passed to {@code LuceneAnalyzer.analyze}, the analyzer's
 * class name is printed, and then every returned token is printed on its own line.
 * An {@code IOException} is caught and only its message is printed.
 *
 * <p>NOTE(review): the statements that use {@code searchEngine} (the benchmark call and the
 * search/print loop) reference a variable that is not declared in this method; they look like
 * leftovers of the previous {@code main} body merged in by the diff view. Confirm against the
 * actual file before relying on this listing.
 */
private static void testAnalyzers() {
try {
// Sample sentence fed to every analyzer below.
String text = "some text from a good man";
// StandardAnalyzer
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
List<String> tokens = LuceneAnalyzer.analyze(standardAnalyzer, text);
System.out.println(standardAnalyzer.getClass().getName() + " :");
tokens.forEach(System.out::println);
// SimpleAnalyzer
SimpleAnalyzer simpleAnalyzer = new SimpleAnalyzer();
List<String> simpleTokens = LuceneAnalyzer.analyze(simpleAnalyzer, text);
System.out.println(simpleAnalyzer.getClass().getName() + " :");
simpleTokens.forEach(System.out::println);
// WhitespaceAnalyzer
WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer();
List<String> whiteSpaceTokens = LuceneAnalyzer.analyze(whitespaceAnalyzer, text);
System.out.println(whitespaceAnalyzer.getClass().getName() + " :");
whiteSpaceTokens.forEach(System.out::println);
// KeywordAnalyzer
KeywordAnalyzer keywordAnalyzer = new KeywordAnalyzer();
List<String> keywordTokens = LuceneAnalyzer.analyze(keywordAnalyzer, text);
System.out.println(keywordAnalyzer.getClass().getName() + " :");
keywordTokens.forEach(System.out::println);
// EnglishAnalyzer
EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer();
List<String> englishTokens = LuceneAnalyzer.analyze(englishAnalyzer, text);
System.out.println(englishAnalyzer.getClass().getName() + " :");
englishTokens.forEach(System.out::println);
// NOTE(review): searchEngine is not in scope here; presumably a removed line of the old main.
Benchmarker.benchmark(() -> {
searchEngine.createIndexesForDirectory("data");
});
// Custom pipeline: standard tokenizer, then lower-case, stop-word, Porter-stem and
// capitalization filters, applied in the order they are added.
Analyzer customAnalyzer = CustomAnalyzer.builder()
.withTokenizer(StandardTokenizerFactory.NAME)
.addTokenFilter(LowerCaseFilterFactory.NAME)
.addTokenFilter(StopFilterFactory.NAME)
.addTokenFilter(PorterStemFilterFactory.NAME)
.addTokenFilter(CapitalizationFilterFactory.class)
.build();
List<String> customTokens = LuceneAnalyzer.analyze(customAnalyzer, text);
System.out.println(customAnalyzer.getClass().getName() + " :");
customTokens.forEach(System.out::println);
// NOTE(review): the search and print loop below also depends on the out-of-scope
// searchEngine, and its opening brace is followed directly by the catch clause —
// presumably removed lines of the old main; confirm.
ArrayList<Document> results = searchEngine.search("Julien Leclercq");
IDocumentRepresenterResolver<String> resolver = new DocumentToStringRepresenterResolver();
System.out.println();
for (var result : results) {
System.out.println(resolver.resolveRepresentation(result));
} catch (IOException e) {
// Only the exception message is reported; the stack trace is not printed.
System.out.println(e.getMessage());
}
}
// private static final String indexDir = "index";
// private static final String dataDir = "data";
// private static TextFileIndexer indexer;
// private static IndexedDocumentSearcher textFileSearcher;
//
// public static void main(String[] args) {
// try {
// createIndex();
// search("Kota");
// search("s");
// } catch (IOException e) {
// e.printStackTrace();
// } catch (ParseException e) {
// e.printStackTrace();
// }
// }
//
// private static void createIndex() throws IOException {
// indexer = new TextFileIndexer(indexDir);
// int numIndexed;
// long startTime = System.currentTimeMillis();
// numIndexed = indexer.createIndex(dataDir);
// long endTime = System.currentTimeMillis();
// indexer.close();
// System.out.println(numIndexed+" File indexed, time taken: "
// +(endTime-startTime)+" ms");
// }
//
// private static void search(String searchQuery) throws IOException, ParseException {
// textFileSearcher = new IndexedDocumentSearcher(indexDir,Constants.CONTENT);
// long startTime = System.currentTimeMillis();
// TopDocs hits = textFileSearcher.search(searchQuery);
// long endTime = System.currentTimeMillis();
//
// System.out.println(hits.totalHits +
// " documents found. Time :" + (endTime - startTime));
// for(ScoreDoc scoreDoc : hits.scoreDocs) {
// Document doc = textFileSearcher.getDocument(scoreDoc);
// System.out.println("File: "
// + doc.get(Constants.FILE_PATH));
// }
// }
}
package com.search.lucene.analyzers;
import com.search.lucene.settings.Constants;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.util.HashMap;
import java.util.Map;
/**
 * Assembles the per-field analyzer configuration used when writing the indexes.
 */
public class CustomAnalyzerWrapperBuilder {

    /**
     * Creates a {@link PerFieldAnalyzerWrapper} that maps the id, file path and file name
     * fields to a {@link KeywordAnalyzer}, the content and description fields to an
     * {@link EnglishAnalyzer}, and every other field to a {@link StandardAnalyzer}.
     *
     * @return a new wrapper backed by freshly created analyzer instances
     */
    public PerFieldAnalyzerWrapper build() {
        final Map<String, Analyzer> analyzersByField = new HashMap<>();

        // Each field gets its own analyzer instance, exactly one per map entry.
        final String[] keywordFields = {Constants.ID, Constants.FILE_PATH, Constants.FILE_NAME};
        for (String field : keywordFields) {
            analyzersByField.put(field, new KeywordAnalyzer());
        }

        final String[] englishFields = {Constants.CONTENT, Constants.DESCRIPTION};
        for (String field : englishFields) {
            analyzersByField.put(field, new EnglishAnalyzer());
        }

        // Fields without an explicit entry fall back to the standard analyzer.
        final Analyzer fallback = new StandardAnalyzer();
        return new PerFieldAnalyzerWrapper(fallback, analyzersByField);
    }
}
package com.search.lucene.analyzers;
import com.search.lucene.settings.Constants;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Helper for running text through a Lucene {@link Analyzer} and collecting the emitted terms.
 */
public class LuceneAnalyzer {

    /**
     * Tokenizes {@code text} with {@code analyzer} and returns the produced terms in the order
     * the token stream emits them.
     *
     * <p>The stream is requested for the {@link Constants#DESCRIPTION} field name, so a
     * per-field analyzer will apply whatever analysis it has registered for that field.
     *
     * @param analyzer the analyzer used to build the token stream
     * @param text the text to tokenize
     * @return the terms produced by the analyzer; empty when the analyzer emits no tokens
     * @throws IOException if the token stream fails while being reset, consumed, ended or closed
     */
    public static List<String> analyze(Analyzer analyzer, String text)
            throws IOException {
        List<String> tokens = new ArrayList<>();
        // The TokenStream consumer workflow is reset -> incrementToken* -> end -> close.
        // try-with-resources guarantees close() even if consumption throws, which releases the
        // stream's resources and lets the same analyzer be asked for another stream later.
        try (TokenStream tokenStream = analyzer.tokenStream(Constants.DESCRIPTION, text)) {
            CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                tokens.add(attr.toString());
            }
            // Signals end-of-stream so the stream can perform its final bookkeeping.
            tokenStream.end();
        }
        return tokens;
    }
}
package com.search.lucene.engine.builder.implementations;
import com.search.lucene.analyzers.CustomAnalyzerWrapperBuilder;
import com.search.lucene.engine.abstractions.ISearchEngine;
import com.search.lucene.engine.builder.abstractions.IEngineBuilder;
import com.search.lucene.engine.implementations.LuceneEngine;
......@@ -21,9 +22,11 @@ public class LuceneEngineBuilder implements IEngineBuilder {
public ISearchEngine build() throws IOException {
ArrayList<IFileIndexer> indexers = new ArrayList<>();
indexers.add(new CSVFileIndexer(IndexPaths.CSV_INDEX_FOLDER_PATH));
indexers.add(new PDFFileIndexer(IndexPaths.PDF_INDEX_FOLDER_PATH));
indexers.add(new TextFileIndexer(IndexPaths.TEXT_INDEX_FOLDER_PATH));
CustomAnalyzerWrapperBuilder builder = new CustomAnalyzerWrapperBuilder();
var wrapper = builder.build();
indexers.add(new CSVFileIndexer(IndexPaths.CSV_INDEX_FOLDER_PATH,wrapper));
indexers.add(new PDFFileIndexer(IndexPaths.PDF_INDEX_FOLDER_PATH,wrapper));
indexers.add(new TextFileIndexer(IndexPaths.TEXT_INDEX_FOLDER_PATH,wrapper));
ArrayList<ISearcher> searchers = new ArrayList<>();
searchers.add(new IndexedDocumentSearcher(IndexPaths.CSV_INDEX_FOLDER_PATH, Constants.VALUE));
......
......@@ -16,6 +16,7 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import java.io.File;
import java.io.FileReader;
......@@ -27,14 +28,14 @@ public class CSVFileIndexer implements IFileIndexer {
private final IndexWriter writer;
private final IFileFilter filter;
/**
 * Creates a CSV indexer: installs a {@code CSVFileFilter}, opens the index directory at
 * {@code indexDirectoryPath}, and creates the {@code IndexWriter} that writes into it.
 *
 * <p>NOTE(review): this listing shows two signature lines and two {@code writer} assignments.
 * They look like the before/after sides of a diff, with the {@code PerFieldAnalyzerWrapper}
 * variant replacing the {@code StandardAnalyzer} one — confirm against the actual file.
 *
 * @param indexDirectoryPath file-system path of the directory that holds the index
 * @param wrapper analyzer passed to the {@code IndexWriterConfig}
 * @throws IOException declared by the constructor; presumably raised when the directory or
 *     writer cannot be opened — confirm
 */
public CSVFileIndexer(String indexDirectoryPath) throws IOException {
public CSVFileIndexer(String indexDirectoryPath, PerFieldAnalyzerWrapper wrapper) throws IOException {
this.filter = new CSVFileFilter();
//this directory will contain the indexes
Directory indexDirectory =
FSDirectory.open(Paths.get(indexDirectoryPath));
//create the indexer
writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
writer = new IndexWriter(indexDirectory, new IndexWriterConfig(wrapper));
}
@Override
......
......@@ -5,6 +5,7 @@ import com.search.lucene.file.filters.implementations.PDFFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
......@@ -26,13 +27,13 @@ public class PDFFileIndexer implements IFileIndexer {
private final IndexWriter writer;
private final IFileFilter filter;
/**
 * Creates a PDF indexer: opens the index directory at {@code indexDirectoryPath}, installs a
 * {@code PDFFileFilter}, and creates the {@code IndexWriter} that writes into the directory.
 *
 * <p>NOTE(review): this listing shows two signature lines and two {@code writer} assignments.
 * They look like the before/after sides of a diff, with the {@code PerFieldAnalyzerWrapper}
 * variant replacing the {@code StandardAnalyzer} one — confirm against the actual file.
 *
 * @param indexDirectoryPath file-system path of the directory that holds the index
 * @param wrapper analyzer passed to the {@code IndexWriterConfig}
 * @throws IOException declared by the constructor; presumably raised when the directory or
 *     writer cannot be opened — confirm
 */
public PDFFileIndexer(String indexDirectoryPath) throws IOException {
public PDFFileIndexer(String indexDirectoryPath, PerFieldAnalyzerWrapper wrapper) throws IOException {
//this directory will contain the indexes
Directory indexDirectory =
FSDirectory.open(Paths.get(indexDirectoryPath));
this.filter = new PDFFileFilter();
//create the indexer
writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
writer = new IndexWriter(indexDirectory, new IndexWriterConfig(wrapper));
}
@Override
public void close() throws CorruptIndexException, IOException {
......
......@@ -10,6 +10,7 @@ import com.search.lucene.file.filters.implementations.TextFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
......@@ -24,14 +25,14 @@ public class TextFileIndexer implements IFileIndexer {
private final IndexWriter writer;
private final IFileFilter filter;
/**
 * Creates a text-file indexer: installs a {@code TextFileFilter}, opens the index directory at
 * {@code indexDirectoryPath}, and creates the {@code IndexWriter} that writes into it.
 *
 * <p>NOTE(review): this listing shows two signature lines and two {@code writer} assignments.
 * They look like the before/after sides of a diff, with the {@code PerFieldAnalyzerWrapper}
 * variant replacing the {@code StandardAnalyzer} one — confirm against the actual file.
 *
 * @param indexDirectoryPath file-system path of the directory that holds the index
 * @param wrapper analyzer passed to the {@code IndexWriterConfig}
 * @throws IOException declared by the constructor; presumably raised when the directory or
 *     writer cannot be opened — confirm
 */
public TextFileIndexer(String indexDirectoryPath) throws IOException {
public TextFileIndexer(String indexDirectoryPath, PerFieldAnalyzerWrapper wrapper) throws IOException {
this.filter = new TextFileFilter();
//this directory will contain the indexes
Directory indexDirectory =
FSDirectory.open(Paths.get(indexDirectoryPath));
//create the indexer
writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
writer = new IndexWriter(indexDirectory, new IndexWriterConfig(wrapper));
}
@Override
public void close() throws CorruptIndexException, IOException {
......
......@@ -8,7 +8,9 @@ public class Constants {
public static final String FILE_PATH = "filepath";
public static final String COLUMN = "column";
public static final String VALUE = "value";
public static final String ID = "id";
public static final String ROW = "row";
public static final int MAX_SEARCH = 10;
public static final String DESCRIPTION = "description";
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment