Commit 82c3cdab authored by mohamad.alturky's avatar mohamad.alturky

Adjust engine, add representer resolver, and fix bugs

parent b0aea8ef
......@@ -2,9 +2,8 @@ package com.search.lucene;
import java.io.IOException;
import com.search.lucene.file.filters.implementations.TextFileFilter;
import com.search.lucene.indexers.implementations.TextFileIndexer;
import com.search.lucene.searchers.implementations.TextFileSearcher;
import com.search.lucene.searchers.implementations.IndexedDocumentSearcher;
import com.search.lucene.settings.Constants;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
......@@ -16,7 +15,7 @@ public class Lucene {
private static final String indexDir = "index";
private static final String dataDir = "data";
private static TextFileIndexer indexer;
private static TextFileSearcher textFileSearcher;
private static IndexedDocumentSearcher textFileSearcher;
public static void main(String[] args) {
try {
......@@ -42,7 +41,7 @@ public class Lucene {
}
private static void search(String searchQuery) throws IOException, ParseException {
textFileSearcher = new TextFileSearcher(indexDir);
textFileSearcher = new IndexedDocumentSearcher(indexDir,Constants.CONTENT);
long startTime = System.currentTimeMillis();
TopDocs hits = textFileSearcher.search(searchQuery);
long endTime = System.currentTimeMillis();
......
......@@ -4,4 +4,5 @@ import org.apache.lucene.document.Document;
/**
 * Converts an indexed Lucene {@link Document} into a representation of type {@code T}.
 *
 * @param <T> the type of the produced representation
 */
public interface IDocumentRepresenter<T> {
/**
 * Builds the representation of the given indexed document.
 *
 * @param indexedDocument the document retrieved from the index
 * @return the representation of {@code indexedDocument}
 */
T Represent(Document indexedDocument);
/**
 * Returns the identifier of the representation schema this representer handles.
 * NOTE(review): presumably this is the key under which the representer is
 * registered for lookup by document type; confirm against the registering code.
 *
 * @return the schema identifier
 */
String representationSchema();
}
......@@ -2,22 +2,30 @@ package com.search.lucene.documents.representers.implementations;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
public class CSVDocumentStringRepresenter implements IDocumentRepresenter<String> {
@Override
public String Represent(Document indexedDocument) {
return String.format("""
file path = %s\s
file name = %s\s
in row = %s\s
in column = %s\s
with content = %s\s
file path is %s\s
file name is %s\s
row number is %s\s
column name is %s\s
content is %s\s
csv row is %s\s
""",
indexedDocument.get(Constants.FILE_PATH),
indexedDocument.get(Constants.FILE_NAME),
indexedDocument.get(Constants.ROW),
indexedDocument.get(Constants.COLUMN),
indexedDocument.get(Constants.VALUE));
indexedDocument.get(Constants.VALUE),
indexedDocument.get(Constants.CSV_ROW));
}
@Override
public String representationSchema() {
return RepresentationSchema.CSV;
}
}
......@@ -2,6 +2,7 @@ package com.search.lucene.documents.representers.implementations;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
public class PDFDocumentStringRepresenter implements IDocumentRepresenter<String> {
......@@ -14,6 +15,11 @@ public class PDFDocumentStringRepresenter implements IDocumentRepresenter<String
""",
indexedDocument.get(Constants.FILE_PATH),
indexedDocument.get(Constants.FILE_NAME),
indexedDocument.get(Constants.CONTENTS));
indexedDocument.get(Constants.CONTENT));
}
@Override
public String representationSchema() {
return RepresentationSchema.PDF;
}
}
......@@ -2,6 +2,7 @@ package com.search.lucene.documents.representers.implementations;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
public class TextDocumentStringRepresenter implements IDocumentRepresenter<String> {
......@@ -14,6 +15,11 @@ public class TextDocumentStringRepresenter implements IDocumentRepresenter<Strin
""",
indexedDocument.get(Constants.FILE_PATH),
indexedDocument.get(Constants.FILE_NAME),
indexedDocument.get(Constants.CONTENTS));
indexedDocument.get(Constants.CONTENT));
}
@Override
public String representationSchema() {
return RepresentationSchema.TEXT;
}
}
package com.search.lucene.documents.representers.resolver;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
import java.util.HashMap;
/**
 * Selects the representer registered for a document's stored schema type and
 * delegates the representation to it.
 *
 * @param <T> the representation type produced by the registered representers
 */
public class DocumentRepresenterResolver<T> {
    /** Registered representers, keyed by the value stored in the document's type field. */
    private final HashMap<String, IDocumentRepresenter<T>> hashMap;

    /**
     * Creates a resolver backed by the given registry.
     *
     * @param hashMap representers keyed by schema type; the map is used as-is, not copied
     */
    public DocumentRepresenterResolver(HashMap<String, IDocumentRepresenter<T>> hashMap) {
        this.hashMap = hashMap;
    }

    /**
     * Represents the document using the representer registered for its schema type.
     *
     * @param document the indexed document; its {@code RepresentationSchema.TYPE} field selects the representer
     * @return the representation produced by the matching representer
     * @throws IllegalArgumentException if the document's type is missing or has no registered representer
     */
    public T resolveRepresentation(Document document) {
        String schemaType = document.get(RepresentationSchema.TYPE);
        IDocumentRepresenter<T> representer = hashMap.get(schemaType);
        if (representer == null) {
            // Fail with the offending type instead of an anonymous NullPointerException.
            throw new IllegalArgumentException(
                    "No document representer registered for schema type: " + schemaType);
        }
        return representer.Represent(document);
    }
}
package com.search.lucene.engine.builder.implementations;
import com.search.lucene.documents.representers.resolver.DocumentRepresenterResolver;
import com.search.lucene.engine.abstractions.ISearchEngine;
import com.search.lucene.engine.builder.abstractions.IEngineBuilder;
import com.search.lucene.engine.implementations.LuceneEngine;
......@@ -8,13 +9,14 @@ import com.search.lucene.indexers.implementations.CSVFileIndexer;
import com.search.lucene.indexers.implementations.PDFFileIndexer;
import com.search.lucene.indexers.implementations.TextFileIndexer;
import com.search.lucene.searchers.abstractions.ISearcher;
import com.search.lucene.searchers.implementations.CSVFileSearcher;
import com.search.lucene.searchers.implementations.PDFFileSearcher;
import com.search.lucene.searchers.implementations.TextFileSearcher;
import com.search.lucene.searchers.implementations.IndexedDocumentSearcher;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.IndexPaths;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
public class LuceneEngineBuilder implements IEngineBuilder {
@Override
......@@ -26,10 +28,9 @@ public class LuceneEngineBuilder implements IEngineBuilder {
indexers.add(new TextFileIndexer(IndexPaths.TEXT_INDEX_FOLDER_PATH));
ArrayList<ISearcher> searchers = new ArrayList<>();
searchers.add(new CSVFileSearcher(IndexPaths.CSV_INDEX_FOLDER_PATH));
searchers.add(new PDFFileSearcher(IndexPaths.PDF_INDEX_FOLDER_PATH));
searchers.add(new TextFileSearcher(IndexPaths.TEXT_INDEX_FOLDER_PATH));
searchers.add(new IndexedDocumentSearcher(IndexPaths.CSV_INDEX_FOLDER_PATH, Constants.VALUE));
searchers.add(new IndexedDocumentSearcher(IndexPaths.PDF_INDEX_FOLDER_PATH,Constants.CONTENT));
searchers.add(new IndexedDocumentSearcher(IndexPaths.TEXT_INDEX_FOLDER_PATH,Constants.CONTENT));
return new LuceneEngine(indexers,searchers);
}
......
package com.search.lucene.engine.implementations;
import com.search.lucene.documents.representers.resolver.DocumentRepresenterResolver;
import com.search.lucene.engine.abstractions.ISearchEngine;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.searchers.abstractions.ISearcher;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import java.io.IOException;
import java.util.ArrayList;
......@@ -28,6 +31,15 @@ public class LuceneEngine implements ISearchEngine {
@Override
public ArrayList<Document> search(String searchQuery) throws IOException, ParseException {
return null;
ArrayList<Document> documents = new ArrayList<>();
for (ISearcher searcher : searchers){
TopDocs hits = searcher.search(searchQuery);
for(ScoreDoc scoreDoc : hits.scoreDocs) {
documents.add(searcher.getDocument(scoreDoc));
}
}
return documents;
}
}
......@@ -5,6 +5,7 @@ import com.search.lucene.file.filters.abstractions.IFileFilter;
import com.search.lucene.file.filters.implementations.CSVFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
......@@ -17,7 +18,6 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
......@@ -55,15 +55,24 @@ public class CSVFileIndexer implements IFileIndexer {
int line = 1;
while ((nextRecord = csvReader.readNext()) != null) {
line++;
StringBuilder csvRowStringBuilder = new StringBuilder();
for (String s : nextRecord) {
csvRowStringBuilder.append(s);
}
String csvRow = csvRowStringBuilder.toString();
for (int i = 0; i < nextRecord.length; i++) {
Document document = new Document();
document.add(new TextField(Constants.COLUMN, columns[i], Field.Store.YES));
document.add(new TextField(RepresentationSchema.TYPE, RepresentationSchema.CSV, Field.Store.YES));
document.add(new TextField(Constants.CSV_ROW, csvRow, Field.Store.YES));
document.add(new TextField(Constants.VALUE, nextRecord[i], Field.Store.YES));
document.add(new TextField(Constants.ROW, Integer.toString(line), Field.Store.YES));
document.add(new StringField(Constants.FILE_PATH, file.getCanonicalPath(), Field.Store.YES));
document.add(new StringField(Constants.FILE_NAME, file.getName(), Field.Store.YES));
documents.add(document);
System.out.print(nextRecord[i] + "\t");
}
System.out.println();
......
......@@ -4,6 +4,7 @@ import com.search.lucene.file.filters.abstractions.IFileFilter;
import com.search.lucene.file.filters.implementations.PDFFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
......@@ -18,7 +19,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.nio.file.Paths;
......@@ -44,7 +44,8 @@ public class PDFFileIndexer implements IFileIndexer {
PDDocument pdDocument = PDDocument.load(file);
String content = new PDFTextStripper().getText(pdDocument);
Document document = new Document();
document.add(new TextField(Constants.CONTENTS, content, Field.Store.YES));
document.add(new TextField(Constants.CONTENT, content, Field.Store.YES));
document.add(new TextField(RepresentationSchema.TYPE, RepresentationSchema.PDF, Field.Store.YES));
document.add(new StringField(Constants.FILE_PATH, file.getCanonicalPath(), Field.Store.YES));
document.add(new StringField(Constants.FILE_NAME, file.getName(), Field.Store.YES));
writer.addDocument(document);
......
package com.search.lucene.indexers.implementations;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
......@@ -10,8 +9,10 @@ import com.search.lucene.file.filters.abstractions.IFileFilter;
import com.search.lucene.file.filters.implementations.TextFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
......@@ -40,13 +41,14 @@ public class TextFileIndexer implements IFileIndexer {
private Document getDocument(File file) throws IOException {
Document document = new Document();
TextField contentField = new TextField(Constants.CONTENTS, new FileReader(file));
TextField contentField = new TextField(Constants.CONTENT, new FileReader(file));
TextField fileNameField = new TextField(Constants.FILE_NAME,
file.getName(),TextField.Store.YES);
TextField filePathField = new TextField(Constants.FILE_PATH,
file.getCanonicalPath(),TextField.Store.YES);
document.add(new TextField(RepresentationSchema.TYPE, RepresentationSchema.TEXT, Field.Store.YES));
document.add(contentField);
document.add(fileNameField);
......
......@@ -10,6 +10,6 @@ import java.io.IOException;
public interface ISearcher {
Document getDocument(ScoreDoc scoreDoc) throws CorruptIndexException, IOException;
Document getDocument(ScoreDoc scoreDoc) throws IOException;
TopDocs search(String searchQuery) throws IOException, ParseException;
}
package com.search.lucene.searchers.implementations;
import com.search.lucene.searchers.abstractions.ISearcher;
import com.search.lucene.settings.Constants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Paths;
/**
 * Searches the Lucene index that holds CSV cell documents.
 *
 * <p>NOTE(review): the {@link IndexReader} opened in the constructor is never
 * closed by this class.
 */
public class CSVFileSearcher implements ISearcher {
    IndexSearcher indexSearcher;
    QueryParser queryParser;
    Query query;

    /**
     * Opens the index stored at the given directory and prepares a query parser.
     *
     * @param indexDirectoryPath path of the directory containing the CSV index
     * @throws IOException if the index directory cannot be opened or read
     */
    public CSVFileSearcher(String indexDirectoryPath)
            throws IOException {
        Directory indexDirectory =
                FSDirectory.open(Paths.get(indexDirectoryPath));
        IndexReader reader = DirectoryReader.open(indexDirectory);
        indexSearcher = new IndexSearcher(reader);
        // CSV documents store each cell's text under VALUE and carry no CONTENTS
        // field, so VALUE must be the default field for unqualified queries.
        queryParser = new QueryParser(Constants.VALUE,
                new StandardAnalyzer());
    }

    /**
     * Parses the query text and runs it against the CSV index.
     *
     * @param searchQuery query text in Lucene classic query syntax
     * @return the top hits, limited to {@code Constants.MAX_SEARCH}
     * @throws IOException if reading the index fails
     * @throws ParseException if {@code searchQuery} cannot be parsed
     */
    @Override
    public TopDocs search(String searchQuery)
            throws IOException, ParseException {
        query = queryParser.parse(searchQuery);
        return indexSearcher.search(query, Constants.MAX_SEARCH);
    }

    /**
     * Loads the stored document a hit refers to.
     *
     * @param scoreDoc a hit returned by {@link #search(String)}
     * @return the stored document for that hit
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if reading the index fails
     */
    @Override
    public Document getDocument(ScoreDoc scoreDoc)
            throws CorruptIndexException, IOException {
        return indexSearcher.doc(scoreDoc.doc);
    }
}
\ No newline at end of file
package com.search.lucene.searchers.implementations;
import java.io.IOException;
import java.nio.file.Paths;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.searchers.abstractions.ISearcher;
import com.search.lucene.settings.Constants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
......@@ -16,29 +20,26 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Paths;
public class IndexedDocumentSearcher implements ISearcher {
public class PDFFileSearcher implements ISearcher {
IndexSearcher indexSearcher;
QueryParser queryParser;
Query query;
private final IndexSearcher indexSearcher;
private final QueryParser queryParser;
public PDFFileSearcher(String indexDirectoryPath)
public IndexedDocumentSearcher(String indexDirectoryPath, String queryForField)
throws IOException {
Directory indexDirectory =
FSDirectory.open(Paths.get(indexDirectoryPath));
IndexReader reader = DirectoryReader.open(indexDirectory);
indexSearcher = new IndexSearcher(reader);
queryParser = new QueryParser(Constants.CONTENTS,
queryParser = new QueryParser(queryForField,
new StandardAnalyzer());
}
@Override
public TopDocs search(String searchQuery)
throws IOException, ParseException {
query = queryParser.parse(searchQuery);
Query query = queryParser.parse(searchQuery);
return indexSearcher.search(query, Constants.MAX_SEARCH);
}
@Override
......@@ -46,5 +47,4 @@ public class PDFFileSearcher implements ISearcher {
throws CorruptIndexException, IOException {
return indexSearcher.doc(scoreDoc.doc);
}
}
\ No newline at end of file
package com.search.lucene.searchers.implementations;
import java.io.IOException;
import java.nio.file.Paths;
import com.search.lucene.searchers.abstractions.ISearcher;
import com.search.lucene.settings.Constants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Searcher over the Lucene index built from plain-text files; unqualified
 * query terms are matched against the {@code Constants.CONTENTS} field.
 */
public class TextFileSearcher implements ISearcher {
    IndexSearcher indexSearcher;
    QueryParser queryParser;
    Query query;

    /**
     * Opens the index at the given location and sets up the query parser.
     *
     * @param indexDirectoryPath directory that contains the text index
     * @throws IOException if the index cannot be opened
     */
    public TextFileSearcher(String indexDirectoryPath) throws IOException {
        this.indexSearcher = new IndexSearcher(
                DirectoryReader.open(FSDirectory.open(Paths.get(indexDirectoryPath))));
        this.queryParser = new QueryParser(Constants.CONTENTS, new StandardAnalyzer());
    }

    /**
     * Parses the given text into a query, remembers it, and executes it.
     *
     * @param searchQuery query text in Lucene classic query syntax
     * @return at most {@code Constants.MAX_SEARCH} top hits
     * @throws IOException if the index cannot be read
     * @throws ParseException if the query text is malformed
     */
    @Override
    public TopDocs search(String searchQuery) throws IOException, ParseException {
        Query parsed = this.queryParser.parse(searchQuery);
        this.query = parsed;
        return this.indexSearcher.search(parsed, Constants.MAX_SEARCH);
    }

    /**
     * Fetches the stored document behind a search hit.
     *
     * @param scoreDoc hit whose document should be loaded
     * @return the stored document
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the index cannot be read
     */
    @Override
    public Document getDocument(ScoreDoc scoreDoc) throws CorruptIndexException, IOException {
        int documentId = scoreDoc.doc;
        return this.indexSearcher.doc(documentId);
    }
}
\ No newline at end of file
......@@ -2,8 +2,8 @@ package com.search.lucene.settings;
public class Constants {
public static final String CONTENTS = "contents";
public static final String INDEXED_FROM_FILE_FORMAT = "from_format";
public static final String CONTENT = "content";
public static final String CSV_ROW = "csv_row";
public static final String FILE_NAME = "filename";
public static final String FILE_PATH = "filepath";
public static final String COLUMN = "column";
......
package com.search.lucene.settings;
public final class IndexType {
public final class RepresentationSchema {
public static final String TYPE = "type";
public static final String CSV = "CSV";
public static final String PDF = "PDF";
public static final String TEXT = "TEXT";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment