Commit 633148e3 authored by mohamad.alturky's avatar mohamad.alturky

Adding Indexers

parent c5a3a023
<component name="libraryTable">
<library name="apache.pdfbox" type="repository">
<properties maven-id="org.apache.pdfbox:pdfbox:2.0.29" />
<CLASSES>
<root url="jar://$MAVEN_REPOSITORY$/org/apache/pdfbox/pdfbox/2.0.29/pdfbox-2.0.29.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/pdfbox/fontbox/2.0.29/fontbox-2.0.29.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-logging/commons-logging/1.2/commons-logging-1.2.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>
\ No newline at end of file
<component name="libraryTable">
<library name="opencsv" type="repository">
<properties maven-id="com.opencsv:opencsv:4.1" />
<CLASSES>
<root url="jar://$MAVEN_REPOSITORY$/com/opencsv/opencsv/4.1/opencsv-4.1.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-lang3/3.6/commons-lang3-3.6.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-text/1.1/commons-text-1.1.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-beanutils/commons-beanutils/1.9.3/commons-beanutils-1.9.3.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-logging/commons-logging/1.2/commons-logging-1.2.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-collections/commons-collections/3.2.2/commons-collections-3.2.2.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>
\ No newline at end of file
This diff is collapsed.
......@@ -12,15 +12,35 @@
<exec.mainClass>com.search.lucene.Lucene</exec.mainClass>
</properties>
<dependencies>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>4.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>9.10.0</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>9.10.0</version>
</dependency>
<dependency>
<groupId>com.search</groupId>
<artifactId>lucene</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
</dependencies>
</project>
\ No newline at end of file
......@@ -3,7 +3,7 @@ package com.search.lucene;
import java.io.IOException;
import com.search.lucene.file.filters.TextFileFilter;
import com.search.lucene.indexers.Indexer;
import com.search.lucene.indexers.TextFileIndexer;
import com.search.lucene.searchers.Searcher;
import com.search.lucene.settings.LuceneConstants;
import org.apache.lucene.document.Document;
......@@ -15,7 +15,7 @@ public class Lucene {
private static final String indexDir = "index";
private static final String dataDir = "data";
private static Indexer indexer;
private static TextFileIndexer indexer;
private static Searcher searcher;
public static void main(String[] args) {
......@@ -31,7 +31,7 @@ public class Lucene {
}
private static void createIndex() throws IOException {
indexer = new Indexer(indexDir);
indexer = new TextFileIndexer(indexDir);
int numIndexed;
long startTime = System.currentTimeMillis();
numIndexed = indexer.createIndex(dataDir, new TextFileFilter());
......
package com.search.lucene.indexers;
import com.opencsv.CSVReader;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.LuceneConstants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
/**
 * Indexes CSV files into a Lucene index, producing one Lucene document per
 * data cell.
 *
 * <p>The first record of each CSV file is treated as the header row. Every
 * cell of every following record becomes a document carrying the header name
 * ({@link LuceneConstants#COLUMN}), the cell text ({@link LuceneConstants#VALUE}),
 * the record number ({@link LuceneConstants#LINE}) and the source file's
 * canonical path and name.
 */
public class CSVFileIndexer implements IFileIndexer {

    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored under the given directory.
     *
     * @param indexDirectoryPath file-system directory that will contain the index
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public CSVFileIndexer(String indexDirectoryPath) throws IOException {
        // This directory will contain the indexes.
        Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath));
        // Create the indexer.
        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if closing the writer fails
     */
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Parses a CSV file and builds one document per data cell.
     *
     * @param file CSV file to read; its first record is used as the header
     * @return the documents built from the file, empty when the file has no header
     * @throws IOException if the file cannot be read
     */
    private ArrayList<Document> getDocument(File file) throws IOException {
        ArrayList<Document> documents = new ArrayList<>();
        // Loop-invariant: resolve once instead of once per cell.
        String filePath = file.getCanonicalPath();
        String fileName = file.getName();

        // try-with-resources guarantees both readers are closed, even when
        // parsing fails half-way through the file.
        try (FileReader fileReader = new FileReader(file);
             CSVReader csvReader = new CSVReader(fileReader)) {
            String[] columns = csvReader.readNext();
            if (columns == null) {
                // Empty file: no header, hence nothing to index.
                return documents;
            }

            // The header is record 1; data records are numbered from 2.
            int line = 1;
            String[] nextRecord;
            while ((nextRecord = csvReader.readNext()) != null) {
                line++;
                String lineNumber = Integer.toString(line);
                for (int i = 0; i < nextRecord.length; i++) {
                    // A ragged record may hold more cells than the header;
                    // index the surplus cells under an empty column name
                    // rather than failing on columns[i].
                    String column = i < columns.length ? columns[i] : "";

                    Document document = new Document();
                    document.add(new TextField(LuceneConstants.COLUMN, column, Field.Store.YES));
                    document.add(new TextField(LuceneConstants.VALUE, nextRecord[i], Field.Store.YES));
                    document.add(new TextField(LuceneConstants.LINE, lineNumber, Field.Store.YES));
                    document.add(new StringField(LuceneConstants.FILE_PATH, filePath, Field.Store.YES));
                    document.add(new StringField(LuceneConstants.FILE_NAME, fileName, Field.Store.YES));
                    documents.add(document);
                    System.out.print(nextRecord[i] + "\t");
                }
                System.out.println();
            }
        }
        return documents;
    }

    /**
     * Adds every document extracted from the given CSV file to the index.
     *
     * @param file CSV file to index
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    private void indexFile(File file) throws IOException {
        System.out.println("Indexing " + file.getCanonicalPath());
        for (Document document : getDocument(file)) {
            writer.addDocument(document);
        }
    }

    /**
     * Indexes every regular, visible, readable file directly inside the data
     * directory that is accepted by the filter. Sub-directories are not
     * descended into.
     *
     * @param dataDirectoryPath directory whose files are to be indexed
     * @param filter            decides which files are indexed
     * @return {@code writer.numRamDocs()} after indexing, i.e. the number of
     *         documents the writer currently buffers in RAM
     *         (NOTE(review): this can under-report if the writer flushed
     *         during indexing — confirm callers only use it as a rough count)
     * @throws IOException if the data directory cannot be listed, a file
     *                     cannot be read, or the index cannot be written
     */
    @Override
    public int createIndex(String dataDirectoryPath, FileFilter filter)
            throws IOException {
        File[] files = new File(dataDirectoryPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs; an assert would be skipped without -ea and
            // surface later as a bare NullPointerException.
            throw new IOException("Cannot list files in data directory: " + dataDirectoryPath);
        }

        for (File file : files) {
            if (!file.isDirectory()
                    && !file.isHidden()
                    && file.exists()
                    && file.canRead()
                    && filter.accept(file)) {
                indexFile(file);
            }
        }
        return writer.numRamDocs();
    }
}
\ No newline at end of file
package com.search.lucene.indexers;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.LuceneConstants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
/**
 * Indexes PDF files into a Lucene index, one document per PDF.
 *
 * <p>Each document stores the text extracted from the PDF
 * ({@link LuceneConstants#CONTENTS}) together with the file's canonical path
 * and name.
 */
public class PDFFileIndexer implements IFileIndexer {

    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored under the given directory.
     *
     * @param indexDirectoryPath file-system directory that will contain the index
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public PDFFileIndexer(String indexDirectoryPath) throws IOException {
        // This directory will contain the indexes.
        Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath));
        // Create the indexer.
        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if closing the writer fails
     */
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Extracts the text of a PDF and wraps it in a Lucene document.
     *
     * <p>This method only builds the document; adding it to the index is the
     * caller's job. (Adding it here as well caused every PDF to be indexed
     * twice.)
     *
     * @param file PDF file to read
     * @return a document holding the extracted text, file path and file name
     * @throws IOException if the PDF cannot be loaded or parsed
     */
    private Document getDocument(File file) throws IOException {
        String content;
        // NOTE(review): PDDocument.load(File) is the PDFBox 2.x entry point; the
        // pom in this commit declares pdfbox 3.0.0, where loading moved to
        // org.apache.pdfbox.Loader — confirm which version is on the classpath.
        // try-with-resources closes the PDF even if text extraction throws.
        try (PDDocument pdDocument = PDDocument.load(file)) {
            content = new PDFTextStripper().getText(pdDocument);
        }

        Document document = new Document();
        document.add(new TextField(LuceneConstants.CONTENTS, content, Field.Store.YES));
        document.add(new StringField(LuceneConstants.FILE_PATH, file.getCanonicalPath(), Field.Store.YES));
        document.add(new StringField(LuceneConstants.FILE_NAME, file.getName(), Field.Store.YES));
        return document;
    }

    /**
     * Builds the document for the given PDF and adds it to the index once.
     *
     * @param file PDF file to index
     * @throws IOException if the file cannot be read or the index cannot be written
     */
    private void indexFile(File file) throws IOException {
        System.out.println("Indexing "+file.getCanonicalPath());
        writer.addDocument(getDocument(file));
    }

    /**
     * Indexes every regular, visible, readable file directly inside the data
     * directory that is accepted by the filter. Sub-directories are not
     * descended into.
     *
     * @param dataDirectoryPath directory whose files are to be indexed
     * @param filter            decides which files are indexed
     * @return {@code writer.numRamDocs()} after indexing, i.e. the number of
     *         documents the writer currently buffers in RAM
     *         (NOTE(review): this can under-report if the writer flushed
     *         during indexing — confirm callers only use it as a rough count)
     * @throws IOException if the data directory cannot be listed, a file
     *                     cannot be read, or the index cannot be written
     */
    @Override
    public int createIndex(String dataDirectoryPath, FileFilter filter)
            throws IOException {
        File[] files = new File(dataDirectoryPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs; an assert would be skipped without -ea and
            // surface later as a bare NullPointerException.
            throw new IOException("Cannot list files in data directory: " + dataDirectoryPath);
        }

        for (File file : files) {
            if (!file.isDirectory()
                    && !file.isHidden()
                    && file.exists()
                    && file.canRead()
                    && filter.accept(file)) {
                indexFile(file);
            }
        }
        return writer.numRamDocs();
    }
}
......@@ -6,6 +6,7 @@ import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.LuceneConstants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
......@@ -16,11 +17,10 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class Indexer {
public class TextFileIndexer implements IFileIndexer {
private final IndexWriter writer;
private IndexWriter writer;
public Indexer(String indexDirectoryPath) throws IOException {
public TextFileIndexer(String indexDirectoryPath) throws IOException {
//this directory will contain the indexes
Directory indexDirectory =
FSDirectory.open(Paths.get(indexDirectoryPath));
......@@ -57,9 +57,12 @@ public class Indexer {
writer.addDocument(document);
}
public int createIndex(String dataDirPath, FileFilter filter)
@Override
public int createIndex(String dataDirectoryPath, FileFilter filter)
throws IOException {
File[] files = new File(dataDirPath).listFiles();
File[] files = new File(dataDirectoryPath).listFiles();
assert files != null;
for (File file : files) {
if(!file.isDirectory()
......
package com.search.lucene.indexers.abstractions;
import java.io.FileFilter;
import java.io.IOException;
/**
 * Contract for indexers that build an index from the files found in a
 * data directory.
 */
public interface IFileIndexer {
/**
 * Indexes files located under the given data directory.
 *
 * @param dataDirectoryPath path of the directory whose files are to be indexed
 * @param filter            filter used to select which files are indexed
 * @return an implementation-defined count; presumably the number of
 *         documents indexed — confirm against the implementing classes
 * @throws IOException if reading the files or writing the index fails
 */
int createIndex(String dataDirectoryPath, FileFilter filter) throws IOException;
}
......@@ -5,6 +5,9 @@ public class LuceneConstants {
public static final String CONTENTS = "contents";
public static final String FILE_NAME = "filename";
public static final String FILE_PATH = "filepath";
public static final String COLUMN = "column";
public static final String VALUE = "value";
public static final String LINE = "line";
public static final int MAX_SEARCH = 10;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment