Commit c739ad57 authored by mohamad.alturky's avatar mohamad.alturky

Initial commit

parents
#/csvIndex
#/pdfIndex
#/textIndex
#/data
/target
\ No newline at end of file
# Default ignored files
/shelf/
/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="lucene" />
</profile>
</annotationProcessing>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="https://repo.maven.apache.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>
\ No newline at end of file
<component name="libraryTable">
<library name="apache.pdfbox" type="repository">
<properties maven-id="org.apache.pdfbox:pdfbox:2.0.29" />
<CLASSES>
<root url="jar://$MAVEN_REPOSITORY$/org/apache/pdfbox/pdfbox/2.0.29/pdfbox-2.0.29.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/pdfbox/fontbox/2.0.29/fontbox-2.0.29.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-logging/commons-logging/1.2/commons-logging-1.2.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>
\ No newline at end of file
<component name="libraryTable">
<library name="opencsv" type="repository">
<properties maven-id="com.opencsv:opencsv:4.1" />
<CLASSES>
<root url="jar://$MAVEN_REPOSITORY$/com/opencsv/opencsv/4.1/opencsv-4.1.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-lang3/3.6/commons-lang3-3.6.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-text/1.1/commons-text-1.1.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-beanutils/commons-beanutils/1.9.3/commons-beanutils-1.9.3.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-logging/commons-logging/1.2/commons-logging-1.2.jar!/" />
<root url="jar://$MAVEN_REPOSITORY$/commons-collections/commons-collections/3.2.2/commons-collections-3.2.2.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="21" project-jdk-type="JavaSDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Palette2">
<group name="Swing">
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
</item>
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
</item>
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.svg" removable="false" auto-create-binding="false" can-attach-label="true">
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
</item>
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
<initial-values>
<property name="text" value="Button" />
</initial-values>
</item>
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="RadioButton" />
</initial-values>
</item>
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="CheckBox" />
</initial-values>
</item>
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
<initial-values>
<property name="text" value="Label" />
</initial-values>
</item>
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
</item>
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.svg" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
</item>
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
<preferred-size width="-1" height="20" />
</default-constraints>
</item>
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.svg" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
</item>
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.svg" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
</item>
</group>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
# HIAST Search Engine
> Language : Java
> <br/>
> Libraries : Lucene
> <br/>
## Features
> We can index and search the following file formats:
> <br/>
>> csv
>> <br/>
>> txt
>> <br/>
>> pdf
## Project Packages Structure
> <div style="color:#9215a0">settings</div> <div style="color:#3b9636">contains program settings and configuration.</div>
> <div style="color:#9215a0">filters</div> <div style="color:#3b9636">contains the file filters to know the target file extension.</div>
> <div style="color:#9215a0">indexers</div> <div style="color:#3b9636">contains the indexers for the csv, pdf and txt file extensions.</div>
> <div style="color:#9215a0">searchers</div> <div style="color:#3b9636">contains the single class used to search the indexed files.</div>
> <div style="color:#9215a0">representers</div> <div style="color:#3b9636">the searcher returns a list of documents, each encoded with its own structure, so there is a strategy for decoding each type of encoding.</div>
> <div style="color:#9215a0">representer resolver</div> <div style="color:#3b9636">takes a document and returns the appropriate representation for it.</div>
> <div style="color:#9215a0">benchmark class</div> <div style="color:#3b9636">calculates the time consumed to execute a function.</div>
> <div style="color:#9215a0">search engine</div> <div style="color:#3b9636">encapsulates the indexing and searching process. It contains a list of indexers, each of which knows how to index one kind of file, and the searchers used to run a query; the output of a search is forwarded to the representer resolver to get the appropriate representation.</div>
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
\ No newline at end of file
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
\ No newline at end of file
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",,24-Sep-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
\ No newline at end of file
s4,TV Show,Jailbirds New Orleans,,,,24-Sep-21,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
\ No newline at end of file
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."
\ No newline at end of file
s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",,24-Sep-21,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries","The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
\ No newline at end of file
s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",,24-Sep-21,2021,PG,91 min,Children & Family Movies,"Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
\ No newline at end of file
s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri","United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia",24-Sep-21,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
File added
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.search</groupId>
    <artifactId>lucene</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <exec.mainClass>com.search.lucene.Lucene</exec.mainClass>
    </properties>
    <!-- Only third-party libraries belong here: a module must not list its own
         coordinates (com.search:lucene) as a dependency, Maven rejects that as a self-reference. -->
    <dependencies>
        <!-- CSV parsing -->
        <dependency>
            <groupId>com.opencsv</groupId>
            <artifactId>opencsv</artifactId>
            <version>4.1</version>
        </dependency>
        <!-- Indexing and searching -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>9.10.0</version>
        </dependency>
        <!-- PDF text extraction -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>3.0.0</version>
        </dependency>
        <!-- Query string parsing -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>9.10.0</version>
        </dependency>
    </dependencies>
</project>
\ No newline at end of file
package com.search.lucene;
import java.io.IOException;
import java.util.ArrayList;
import com.search.lucene.documents.representers.resolver.DocumentToStringRepresenterResolver;
import com.search.lucene.documents.representers.resolver.IDocumentRepresenterResolver;
import com.search.lucene.engine.abstractions.ISearchEngine;
import com.search.lucene.engine.builder.abstractions.IEngineBuilder;
import com.search.lucene.engine.builder.implementations.LuceneEngineBuilder;
import com.search.lucene.indexers.implementations.TextFileIndexer;
import com.search.lucene.performance.Benchmarker;
import com.search.lucene.searchers.implementations.IndexedDocumentSearcher;
import com.search.lucene.settings.Constants;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
/**
 * Command-line entry point of the search engine demo.
 *
 * <p>Builds the engine, indexes the files found in {@link #DATA_DIRECTORY} (timed through
 * {@code Benchmarker}), runs {@link #SAMPLE_QUERY} and prints the string representation of
 * every returned document.
 */
public class Lucene {

    /** Directory whose files are indexed before the sample query is executed. */
    private static final String DATA_DIRECTORY = "data";

    /** Query executed once indexing has finished. */
    private static final String SAMPLE_QUERY = "Julien Leclercq";

    /**
     * Indexes the data directory, searches for the sample query and prints each hit.
     *
     * @param args command-line arguments; not used
     * @throws Exception if building the engine, indexing or searching fails
     */
    public static void main(String[] args) throws Exception {
        IEngineBuilder builder = new LuceneEngineBuilder();
        ISearchEngine searchEngine = builder.build();

        Benchmarker.benchmark(() -> {
            searchEngine.createIndexesForDirectory(DATA_DIRECTORY);
        });

        ArrayList<Document> results = searchEngine.search(SAMPLE_QUERY);
        IDocumentRepresenterResolver<String> resolver = new DocumentToStringRepresenterResolver();

        // Blank line separating the indexing output from the search results.
        System.out.println();
        for (var result : results) {
            System.out.println(resolver.resolveRepresentation(result));
        }
    }
}
package com.search.lucene.documents.representers.abstractions;
import org.apache.lucene.document.Document;
/**
 * Converts an indexed Lucene document into a representation of type {@code T}.
 *
 * @param <T> type of the produced representation
 */
public interface IDocumentRepresenter<T> {
/**
 * Builds the representation of the given document.
 *
 * @param indexedDocument document whose fields are read to build the representation
 * @return the representation of {@code indexedDocument}
 */
T Represent(Document indexedDocument);
/**
 * Identifies the kind of document this representer handles.
 *
 * @return the schema identifier of this representer
 */
String representationSchema();
}
package com.search.lucene.documents.representers.implementations;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
/**
 * Renders a document produced from a CSV cell as multi-line text listing its file path,
 * file name, row number, column name, cell value and the text of the whole row.
 */
public class CSVDocumentStringRepresenter implements IDocumentRepresenter<String> {

    /** Output layout; each placeholder is followed by one space and a line break. */
    private static final String TEMPLATE = """
            file path is %s\s
            file name is %s\s
            row number is %s\s
            column name is %s\s
            content is %s\s
            csv row is %s\s
            """;

    /**
     * Formats the stored fields of the given document with the class template.
     *
     * @param indexedDocument document whose stored fields are printed
     * @return the formatted, multi-line description
     */
    @Override
    public String Represent(Document indexedDocument) {
        String filePath = indexedDocument.get(Constants.FILE_PATH);
        String fileName = indexedDocument.get(Constants.FILE_NAME);
        String rowNumber = indexedDocument.get(Constants.ROW);
        String columnName = indexedDocument.get(Constants.COLUMN);
        String cellValue = indexedDocument.get(Constants.VALUE);
        String rowText = indexedDocument.get(Constants.CSV_ROW);
        return TEMPLATE.formatted(filePath, fileName, rowNumber, columnName, cellValue, rowText);
    }

    /** @return the CSV schema identifier */
    @Override
    public String representationSchema() {
        return RepresentationSchema.CSV;
    }
}
package com.search.lucene.documents.representers.implementations;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
/**
 * Renders a document carrying the PDF schema as multi-line text listing its file path,
 * file name and content.
 */
public class PDFDocumentStringRepresenter implements IDocumentRepresenter<String> {

    /** Output layout; each placeholder is followed by one space and a line break. */
    private static final String TEMPLATE = """
            file path = %s\s
            file name = %s\s
            with content = %s\s
            """;

    /**
     * Formats the stored fields of the given document with the class template.
     *
     * @param indexedDocument document whose stored fields are printed
     * @return the formatted, multi-line description
     */
    @Override
    public String Represent(Document indexedDocument) {
        String filePath = indexedDocument.get(Constants.FILE_PATH);
        String fileName = indexedDocument.get(Constants.FILE_NAME);
        String content = indexedDocument.get(Constants.CONTENT);
        return TEMPLATE.formatted(filePath, fileName, content);
    }

    /** @return the PDF schema identifier */
    @Override
    public String representationSchema() {
        return RepresentationSchema.PDF;
    }
}
package com.search.lucene.documents.representers.implementations;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
/**
 * Renders a document carrying the text schema as multi-line text listing its file path,
 * file name and content.
 */
public class TextDocumentStringRepresenter implements IDocumentRepresenter<String> {

    /** Output layout; each placeholder is followed by one space and a line break. */
    private static final String TEMPLATE = """
            file path = %s\s
            file name = %s\s
            with content = %s\s
            """;

    /**
     * Formats the stored fields of the given document with the class template.
     *
     * @param indexedDocument document whose stored fields are printed
     * @return the formatted, multi-line description
     */
    @Override
    public String Represent(Document indexedDocument) {
        String filePath = indexedDocument.get(Constants.FILE_PATH);
        String fileName = indexedDocument.get(Constants.FILE_NAME);
        String content = indexedDocument.get(Constants.CONTENT);
        return TEMPLATE.formatted(filePath, fileName, content);
    }

    /** @return the text schema identifier */
    @Override
    public String representationSchema() {
        return RepresentationSchema.TEXT;
    }
}
package com.search.lucene.documents.representers.resolver;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.documents.representers.implementations.CSVDocumentStringRepresenter;
import com.search.lucene.documents.representers.implementations.PDFDocumentStringRepresenter;
import com.search.lucene.documents.representers.implementations.TextDocumentStringRepresenter;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.document.Document;
import java.util.HashMap;
public class DocumentToStringRepresenterResolver implements IDocumentRepresenterResolver<String>{
private final HashMap<String,IDocumentRepresenter<String>> hashMap;
public DocumentToStringRepresenterResolver() {
hashMap = new HashMap<>();
hashMap.put(RepresentationSchema.CSV,new CSVDocumentStringRepresenter());
hashMap.put(RepresentationSchema.TEXT,new TextDocumentStringRepresenter());
hashMap.put(RepresentationSchema.PDF,new PDFDocumentStringRepresenter());
}
public String resolveRepresentation(Document document) {
return hashMap.get(document.get(RepresentationSchema.TYPE)).Represent(document);
}
}
package com.search.lucene.documents.representers.resolver;
import org.apache.lucene.document.Document;
/**
 * Produces a representation of type {@code T} for a Lucene document.
 *
 * @param <T> type of the produced representation
 */
public interface IDocumentRepresenterResolver<T> {
/**
 * Resolves the representation of the given document.
 *
 * @param document document to represent
 * @return the representation of {@code document}
 */
T resolveRepresentation(Document document);
}
package com.search.lucene.engine.abstractions;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Facade over indexing a directory and searching what was indexed.
 */
public interface ISearchEngine {
/**
 * Creates the indexes for the given directory.
 *
 * @param directoryPath path of the directory to index
 * @throws IOException if indexing fails
 */
void createIndexesForDirectory(String directoryPath) throws IOException;
/**
 * Runs a query and returns the documents found.
 *
 * @param searchQuery query text
 * @return the documents found for the query
 * @throws IOException if the search fails on I/O
 * @throws ParseException if the query text cannot be parsed
 */
ArrayList<Document> search(String searchQuery) throws IOException, ParseException;
}
package com.search.lucene.engine.builder.abstractions;
import com.search.lucene.engine.abstractions.ISearchEngine;
import java.io.IOException;
/**
 * Factory abstraction for {@link ISearchEngine} instances.
 */
public interface IEngineBuilder {
/**
 * Builds a search engine.
 *
 * @return the constructed engine
 * @throws IOException if building the engine fails on I/O
 */
ISearchEngine build() throws IOException;
}
package com.search.lucene.engine.builder.implementations;
import com.search.lucene.engine.abstractions.ISearchEngine;
import com.search.lucene.engine.builder.abstractions.IEngineBuilder;
import com.search.lucene.engine.implementations.LuceneEngine;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.indexers.implementations.CSVFileIndexer;
import com.search.lucene.indexers.implementations.PDFFileIndexer;
import com.search.lucene.indexers.implementations.TextFileIndexer;
import com.search.lucene.searchers.abstractions.ISearcher;
import com.search.lucene.searchers.implementations.IndexedDocumentSearcher;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.IndexPaths;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Assembles a {@link LuceneEngine} with one indexer and one searcher per supported
 * file type (CSV, PDF, text), each pointed at its own index folder.
 */
public class LuceneEngineBuilder implements IEngineBuilder {

    /**
     * Creates the indexers first, then the searchers, and wires both lists into a new engine.
     *
     * @return the assembled engine
     * @throws IOException if creating an indexer or a searcher fails
     */
    @Override
    public ISearchEngine build() throws IOException {
        ArrayList<IFileIndexer> fileIndexers = createIndexers();
        ArrayList<ISearcher> documentSearchers = createSearchers();
        return new LuceneEngine(fileIndexers, documentSearchers);
    }

    /** Creates the CSV, PDF and text indexers, in that order. */
    private ArrayList<IFileIndexer> createIndexers() throws IOException {
        ArrayList<IFileIndexer> fileIndexers = new ArrayList<>();
        fileIndexers.add(new CSVFileIndexer(IndexPaths.CSV_INDEX_FOLDER_PATH));
        fileIndexers.add(new PDFFileIndexer(IndexPaths.PDF_INDEX_FOLDER_PATH));
        fileIndexers.add(new TextFileIndexer(IndexPaths.TEXT_INDEX_FOLDER_PATH));
        return fileIndexers;
    }

    /**
     * Creates the searchers for the CSV, PDF and text indexes, in that order. The CSV
     * searcher is given the value field, the other two the content field.
     */
    private ArrayList<ISearcher> createSearchers() throws IOException {
        ArrayList<ISearcher> documentSearchers = new ArrayList<>();
        documentSearchers.add(
                new IndexedDocumentSearcher(IndexPaths.CSV_INDEX_FOLDER_PATH, Constants.VALUE));
        documentSearchers.add(
                new IndexedDocumentSearcher(IndexPaths.PDF_INDEX_FOLDER_PATH, Constants.CONTENT));
        documentSearchers.add(
                new IndexedDocumentSearcher(IndexPaths.TEXT_INDEX_FOLDER_PATH, Constants.CONTENT));
        return documentSearchers;
    }
}
package com.search.lucene.engine.implementations;
import com.search.lucene.engine.abstractions.ISearchEngine;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.searchers.abstractions.ISearcher;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Search engine that delegates indexing to a list of file indexers and searching to a
 * list of searchers, concatenating the documents found by each searcher.
 */
public class LuceneEngine implements ISearchEngine {
    private final ArrayList<IFileIndexer> indexers;
    private final ArrayList<ISearcher> searchers;

    /**
     * @param indexers indexers run, in list order, by {@link #createIndexesForDirectory(String)}
     * @param searchers searchers queried, in list order, by {@link #search(String)}
     */
    public LuceneEngine(ArrayList<IFileIndexer> indexers, ArrayList<ISearcher> searchers) {
        this.indexers = indexers;
        this.searchers = searchers;
    }

    /**
     * Runs every indexer on the directory and closes it afterwards.
     *
     * <p>Each indexer is closed even when its indexing step fails, so it is not left open
     * by an aborted run. Because the indexers are closed here, they are not reusable after
     * this call.
     *
     * @param directoryPath directory handed to every indexer
     * @throws IOException if indexing or closing an indexer fails
     */
    @Override
    public void createIndexesForDirectory(String directoryPath) throws IOException {
        for (IFileIndexer indexer : indexers) {
            System.out.println("Starting indexing " + indexer.getClass().getName());
            indexAndClose(indexer, directoryPath);
        }
    }

    /**
     * Indexes the directory with one indexer and always closes that indexer. A failure
     * while closing is attached as suppressed to an earlier indexing failure instead of
     * replacing it.
     */
    private static void indexAndClose(IFileIndexer indexer, String directoryPath) throws IOException {
        Throwable indexingFailure = null;
        try {
            indexer.createIndex(directoryPath);
        } catch (Throwable t) {
            indexingFailure = t;
            throw t;
        } finally {
            if (indexingFailure == null) {
                indexer.close();
            } else {
                try {
                    indexer.close();
                } catch (IOException | RuntimeException closeFailure) {
                    indexingFailure.addSuppressed(closeFailure);
                }
            }
        }
    }

    /**
     * Initializes each searcher, runs the query on it and collects the matching documents.
     *
     * @param searchQuery query text passed to every searcher
     * @return the documents of all searchers' hits, grouped in searcher order
     * @throws IOException if a searcher fails on I/O
     * @throws ParseException if the query text cannot be parsed
     */
    @Override
    public ArrayList<Document> search(String searchQuery) throws IOException, ParseException {
        ArrayList<Document> documents = new ArrayList<>();
        for (ISearcher searcher : searchers) {
            searcher.init();
            TopDocs hits = searcher.search(searchQuery);
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                documents.add(searcher.getDocument(scoreDoc));
            }
        }
        return documents;
    }
}
package com.search.lucene.file.filters.abstractions;
import java.io.File;
import java.io.FileFilter;
/**
 * Project-level alias of {@link FileFilter} used by the indexers to select the files
 * they handle.
 */
public interface IFileFilter extends FileFilter{
/**
 * Tests whether the given file should be accepted.
 *
 * @param pathname file to test
 * @return {@code true} if the file is accepted
 */
@Override
boolean accept(File pathname);
}
package com.search.lucene.file.filters.implementations;
import com.search.lucene.file.filters.abstractions.IFileFilter;
import java.io.File;
/** Accepts files whose lower-cased name ends with {@code .csv}. */
public class CSVFileFilter implements IFileFilter {

    /** Suffix an accepted file name must end with once lower-cased. */
    private static final String EXTENSION = ".csv";

    /**
     * @param pathname file to test
     * @return {@code true} if the lower-cased file name ends with {@code .csv}
     */
    @Override
    public boolean accept(File pathname) {
        String lowerCaseName = pathname.getName().toLowerCase();
        return lowerCaseName.endsWith(EXTENSION);
    }
}
\ No newline at end of file
package com.search.lucene.file.filters.implementations;
import com.search.lucene.file.filters.abstractions.IFileFilter;
import java.io.File;
/** Accepts files whose lower-cased name ends with {@code .pdf}. */
public class PDFFileFilter implements IFileFilter {

    /** Suffix an accepted file name must end with once lower-cased. */
    private static final String EXTENSION = ".pdf";

    /**
     * @param pathname file to test
     * @return {@code true} if the lower-cased file name ends with {@code .pdf}
     */
    @Override
    public boolean accept(File pathname) {
        String lowerCaseName = pathname.getName().toLowerCase();
        return lowerCaseName.endsWith(EXTENSION);
    }
}
\ No newline at end of file
package com.search.lucene.file.filters.implementations;
import com.search.lucene.file.filters.abstractions.IFileFilter;
import java.io.File;
/** Accepts files whose lower-cased name ends with {@code .txt}. */
public class TextFileFilter implements IFileFilter {

    /** Suffix an accepted file name must end with once lower-cased. */
    private static final String EXTENSION = ".txt";

    /**
     * @param pathname file to test
     * @return {@code true} if the lower-cased file name ends with {@code .txt}
     */
    @Override
    public boolean accept(File pathname) {
        String lowerCaseName = pathname.getName().toLowerCase();
        return lowerCaseName.endsWith(EXTENSION);
    }
}
\ No newline at end of file
package com.search.lucene.indexers.abstractions;
import org.apache.lucene.index.CorruptIndexException;
import java.io.FileFilter;
import java.io.IOException;
/**
 * Indexes the files of a data directory and releases its resources when closed.
 */
public interface IFileIndexer {
/**
 * Indexes the files found in the given data directory.
 *
 * @param dataDirectoryPath path of the directory whose files are indexed
 * @return an implementation-defined count or status code
 * @throws IOException if reading a file or writing the index fails
 */
int createIndex(String dataDirectoryPath) throws IOException;
/**
 * Closes the indexer.
 *
 * @throws CorruptIndexException if the index is found to be corrupt while closing
 * @throws IOException if closing fails on I/O
 */
void close() throws CorruptIndexException, IOException;
}
package com.search.lucene.indexers.implementations;
import com.opencsv.CSVReader;
import com.search.lucene.file.filters.abstractions.IFileFilter;
import com.search.lucene.file.filters.implementations.CSVFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.ArrayList;
/**
 * Indexes CSV files cell by cell.
 *
 * <p>The first record of a file is treated as the header. Every cell of every following
 * record becomes one Lucene document that stores the column name, the cell value, the row
 * number, the text of the whole row, the file path and name, and the CSV schema type.
 */
public class CSVFileIndexer implements IFileIndexer {

    /** Value returned by {@link #createIndex(String)} when the data directory cannot be listed. */
    private static final int DIRECTORY_NOT_LISTABLE = -1000;

    /** Separator placed between the cells when the text of a whole row is stored. */
    private static final String ROW_SEPARATOR = ",";

    private final IndexWriter writer;
    private final IFileFilter filter;

    /**
     * Opens an index writer, using a {@link StandardAnalyzer}, on the given folder.
     *
     * @param indexDirectoryPath folder that will contain the index
     * @throws IOException if the index folder cannot be opened
     */
    public CSVFileIndexer(String indexDirectoryPath) throws IOException {
        this.filter = new CSVFileFilter();
        Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath));
        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
    }

    /** Closes the underlying index writer. */
    @Override
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Indexes every regular, visible, readable file of the directory accepted by the CSV filter.
     *
     * @param dataDirectoryPath directory whose CSV files are indexed
     * @return {@code -1000} if the directory cannot be listed, otherwise the number of
     *         documents the writer currently buffers in RAM
     * @throws IOException if reading a file or writing the index fails
     */
    @Override
    public int createIndex(String dataDirectoryPath) throws IOException {
        File[] files = new File(dataDirectoryPath).listFiles();
        if (files == null) {
            return DIRECTORY_NOT_LISTABLE;
        }
        for (File file : files) {
            if (!file.isDirectory()
                    && !file.isHidden()
                    && file.exists()
                    && file.canRead()
                    && filter.accept(file)) {
                indexFile(file);
            }
        }
        return writer.numRamDocs();
    }

    /**
     * Reads one CSV file as UTF-8 and writes a document per cell straight to the index,
     * so a large file is never held in memory as a whole. The readers are closed even
     * when parsing or writing fails.
     */
    private void indexFile(File file) throws IOException {
        // Identical for every cell of the file, so resolved once.
        String filePath = file.getCanonicalPath();
        String fileName = file.getName();

        try (FileReader fileReader = new FileReader(file, StandardCharsets.UTF_8);
                CSVReader csvReader = new CSVReader(fileReader)) {
            String[] columns = csvReader.readNext();
            if (columns == null) {
                // Empty file: no header, hence nothing to index.
                return;
            }
            // The header is record 1, so the first data record is reported as row 2.
            int line = 1;
            String[] cells;
            while ((cells = csvReader.readNext()) != null) {
                line++;
                String rowNumber = Integer.toString(line);
                String csvRow = String.join(ROW_SEPARATOR, cells);
                for (int i = 0; i < cells.length; i++) {
                    writer.addDocument(
                            cellDocument(columnName(columns, i), cells[i], rowNumber, csvRow, filePath, fileName));
                }
            }
        }
    }

    /** Returns the header name of a column, or an empty name for cells beyond the header width. */
    private static String columnName(String[] columns, int index) {
        return index < columns.length ? columns[index] : "";
    }

    /** Builds the Lucene document describing a single CSV cell. */
    private static Document cellDocument(String column, String value, String rowNumber,
            String csvRow, String filePath, String fileName) {
        Document document = new Document();
        document.add(new TextField(Constants.COLUMN, column, Field.Store.YES));
        document.add(new TextField(RepresentationSchema.TYPE, RepresentationSchema.CSV, Field.Store.YES));
        document.add(new TextField(Constants.CSV_ROW, csvRow, Field.Store.YES));
        document.add(new TextField(Constants.VALUE, value, Field.Store.YES));
        document.add(new TextField(Constants.ROW, rowNumber, Field.Store.YES));
        document.add(new StringField(Constants.FILE_PATH, filePath, Field.Store.YES));
        document.add(new StringField(Constants.FILE_NAME, fileName, Field.Store.YES));
        return document;
    }
}
\ No newline at end of file
package com.search.lucene.indexers.implementations;
import com.search.lucene.file.filters.abstractions.IFileFilter;
import com.search.lucene.file.filters.implementations.PDFFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
/**
 * Indexes PDF files: the extracted text of each file becomes one index document.
 */
public class PDFFileIndexer implements IFileIndexer {
    /**
     * Value returned by {@link #createIndex(String)} when the data directory cannot be listed;
     * same sentinel as the one used by {@code CSVFileIndexer}.
     */
    private static final int DIRECTORY_NOT_LISTABLE = -1000;

    private final IndexWriter writer;
    private final IFileFilter filter;

    /**
     * Opens (or creates) the index stored in the given directory.
     *
     * @param indexDirectoryPath directory that will contain the index files
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public PDFFileIndexer(String indexDirectoryPath) throws IOException {
        // This directory will contain the index files.
        Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath));
        this.filter = new PDFFileFilter();
        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws CorruptIndexException if the index is found to be corrupt while closing
     * @throws IOException if closing the writer fails
     */
    @Override
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Builds the index document for a PDF file. The document is only built here; adding it
     * to the index is the job of {@link #indexFile(File)}, so each file is indexed once.
     *
     * @param file the PDF file to read
     * @return a document holding the extracted text, the type marker, the canonical path and
     *         the file name
     * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
     */
    private Document getDocument(File file) throws IOException {
        String content;
        // try-with-resources: the PDF is closed even when text extraction fails.
        try (PDDocument pdDocument = PDDocument.load(file)) {
            content = new PDFTextStripper().getText(pdDocument);
        }
        Document document = new Document();
        document.add(new TextField(Constants.CONTENT, content, Field.Store.YES));
        document.add(new TextField(RepresentationSchema.TYPE, RepresentationSchema.PDF, Field.Store.YES));
        document.add(new StringField(Constants.FILE_PATH, file.getCanonicalPath(), Field.Store.YES));
        document.add(new StringField(Constants.FILE_NAME, file.getName(), Field.Store.YES));
        return document;
    }

    /** Builds the document for the given PDF file and adds it to the index exactly once. */
    private void indexFile(File file) throws IOException {
        Document document = getDocument(file);
        writer.addDocument(document);
    }

    /** Tells whether the entry is a visible, readable regular file accepted by the PDF filter. */
    private boolean isIndexable(File file) {
        return !file.isDirectory()
                && !file.isHidden()
                && file.exists()
                && file.canRead()
                && filter.accept(file);
    }

    /**
     * Indexes every PDF file located directly inside the given directory.
     *
     * @param dataDirectoryPath directory that holds the PDF files
     * @return {@code -1000} when the directory cannot be listed; otherwise the value of
     *         {@code IndexWriter.numRamDocs()} after indexing
     * @throws IOException if reading a file or writing to the index fails
     */
    @Override
    public int createIndex(String dataDirectoryPath)
            throws IOException {
        File[] files = new File(dataDirectoryPath).listFiles();
        if (files == null) {
            // listFiles() returns null for a missing or unreadable directory; an assert would
            // be skipped when assertions are disabled and the loop below would then throw.
            return DIRECTORY_NOT_LISTABLE;
        }
        for (File file : files) {
            if (isIndexable(file)) {
                indexFile(file);
            }
        }
        // NOTE(review): numRamDocs() reports documents currently buffered by the writer, which
        // may differ from the number of documents added by this call - confirm with callers.
        return writer.numRamDocs();
    }
}
package com.search.lucene.indexers.implementations;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import com.search.lucene.file.filters.abstractions.IFileFilter;
import com.search.lucene.file.filters.implementations.TextFileFilter;
import com.search.lucene.indexers.abstractions.IFileIndexer;
import com.search.lucene.settings.Constants;
import com.search.lucene.settings.RepresentationSchema;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Indexes plain-text files: the content of each file becomes one index document.
 */
public class TextFileIndexer implements IFileIndexer {
    /**
     * Value returned by {@link #createIndex(String)} when the data directory cannot be listed;
     * same sentinel as the one used by {@code CSVFileIndexer}.
     */
    private static final int DIRECTORY_NOT_LISTABLE = -1000;

    private final IndexWriter writer;
    private final IFileFilter filter;

    /**
     * Opens (or creates) the index stored in the given directory.
     *
     * @param indexDirectoryPath directory that will contain the index files
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public TextFileIndexer(String indexDirectoryPath) throws IOException {
        this.filter = new TextFileFilter();
        // This directory will contain the index files.
        Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath));
        writer = new IndexWriter(indexDirectory, new IndexWriterConfig(new StandardAnalyzer()));
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws CorruptIndexException if the index is found to be corrupt while closing
     * @throws IOException if closing the writer fails
     */
    @Override
    public void close() throws CorruptIndexException, IOException {
        writer.close();
    }

    /**
     * Builds the index document for a text file.
     *
     * @param file the text file being indexed
     * @param contentReader an open reader over the file content; it is owned by the caller
     * @return a document holding the content reader, the type marker, the file name and the
     *         canonical path
     * @throws IOException if the canonical path cannot be resolved
     */
    private Document getDocument(File file, FileReader contentReader) throws IOException {
        Document document = new Document();
        TextField contentField = new TextField(Constants.CONTENT, contentReader);
        // NOTE(review): name and path are TextFields here, whereas the CSV and PDF indexers
        // store them as StringFields - kept as is so existing indexes and queries still match.
        TextField fileNameField = new TextField(Constants.FILE_NAME,
                file.getName(), TextField.Store.YES);
        TextField filePathField = new TextField(Constants.FILE_PATH,
                file.getCanonicalPath(), TextField.Store.YES);
        document.add(new TextField(RepresentationSchema.TYPE, RepresentationSchema.TEXT, Field.Store.YES));
        document.add(contentField);
        document.add(fileNameField);
        document.add(filePathField);
        return document;
    }

    /**
     * Adds the given text file to the index. The reader over the file is opened here and held
     * in a try-with-resources block so that it is released even when building or adding the
     * document fails.
     */
    private void indexFile(File file) throws IOException {
        try (FileReader contentReader = new FileReader(file)) {
            Document document = getDocument(file, contentReader);
            writer.addDocument(document);
        }
    }

    /** Tells whether the entry is a visible, readable regular file accepted by the text filter. */
    private boolean isIndexable(File file) {
        return !file.isDirectory()
                && !file.isHidden()
                && file.exists()
                && file.canRead()
                && filter.accept(file);
    }

    /**
     * Indexes every text file located directly inside the given directory.
     *
     * @param dataDirectoryPath directory that holds the text files
     * @return {@code -1000} when the directory cannot be listed; otherwise the value of
     *         {@code IndexWriter.numRamDocs()} after indexing
     * @throws IOException if reading a file or writing to the index fails
     */
    @Override
    public int createIndex(String dataDirectoryPath)
            throws IOException {
        File[] files = new File(dataDirectoryPath).listFiles();
        if (files == null) {
            // listFiles() returns null for a missing or unreadable directory; an assert would
            // be skipped when assertions are disabled and the loop below would then throw.
            return DIRECTORY_NOT_LISTABLE;
        }
        for (File file : files) {
            if (isIndexable(file)) {
                indexFile(file);
            }
        }
        // NOTE(review): numRamDocs() reports documents currently buffered by the writer, which
        // may differ from the number of documents added by this call - confirm with callers.
        return writer.numRamDocs();
    }
}
\ No newline at end of file
package com.search.lucene.performance;
import java.util.concurrent.Callable;
import java.util.function.Function;
/**
 * Small helper that runs a task and prints how long it took to standard output.
 */
public class Benchmarker {
    /** Number of nanoseconds in one millisecond, used to report the elapsed time in ms. */
    private static final long NANOS_PER_MILLI = 1_000_000L;

    /**
     * Runs the given callable, prints its duration and returns its result.
     *
     * @param callable the task to time
     * @param <T> type of the value produced by the task
     * @return whatever the task returned
     * @throws Exception if the task throws; no timing line is printed in that case
     */
    public static <T> T benchmark(Callable<T> callable) throws Exception {
        long startNanos = System.nanoTime();
        T result = callable.call();
        report(callable, startNanos);
        return result;
    }

    /**
     * Runs the given task and prints its duration.
     *
     * @param callable the task to time
     * @throws Exception if the task throws; no timing line is printed in that case
     */
    public static void benchmark(RunnableTask callable) throws Exception {
        long startNanos = System.nanoTime();
        callable.run();
        report(callable, startNanos);
    }

    /**
     * Prints the elapsed time since {@code startNanos} for the given task.
     * System.nanoTime() is used because it is meant for measuring elapsed time, whereas
     * wall-clock time can jump when the system clock is adjusted.
     */
    private static void report(Object task, long startNanos) {
        long time = (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
        System.out.println("The Function " + task.getClass().getName() + " takes " + time +" ms");
    }
}
package com.search.lucene.performance;
/**
 * A task without a result that, unlike {@link Runnable}, is allowed to throw checked
 * exceptions. Marked as a functional interface so that it stays usable as a lambda or
 * method-reference target: the compiler rejects a second abstract method.
 */
@FunctionalInterface
public interface RunnableTask {
    /**
     * Executes the task.
     *
     * @throws Exception if the task fails
     */
    void run() throws Exception;
}
package com.search.lucene.searchers.abstractions;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import java.io.IOException;
/**
 * Contract for components that run textual queries and resolve the resulting hits
 * to documents.
 */
public interface ISearcher {
    /**
     * Prepares the searcher for use.
     *
     * @throws IOException if the preparation fails for an I/O reason
     */
    void init() throws IOException;

    /**
     * Runs the given query.
     *
     * @param searchQuery the query text
     * @return the top hits for the query
     * @throws IOException if the search fails for an I/O reason
     * @throws ParseException if the query text cannot be parsed
     */
    TopDocs search(String searchQuery) throws IOException, ParseException;

    /**
     * Resolves a search hit to its document.
     *
     * @param scoreDoc a hit, typically taken from the result of {@link #search(String)}
     * @return the document the hit refers to
     * @throws IOException if the document cannot be read
     */
    Document getDocument(ScoreDoc scoreDoc) throws IOException;
}
package com.search.lucene.searchers.implementations;
import java.io.IOException;
import java.nio.file.Paths;
import com.search.lucene.documents.representers.abstractions.IDocumentRepresenter;
import com.search.lucene.searchers.abstractions.ISearcher;
import com.search.lucene.settings.Constants;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Searches a single field of an index stored in a file-system directory.
 *
 * <p>{@link #init()} must be called before {@link #search(String)} or
 * {@link #getDocument(ScoreDoc)}; it may be called again to reopen the index.
 */
public class IndexedDocumentSearcher implements ISearcher {
    private IndexSearcher indexSearcher;
    private QueryParser queryParser;
    private final String indexDirectoryPath;
    private final String queryForField;

    /**
     * Remembers where the index lives and which field queries target. Nothing is opened
     * until {@link #init()} is called.
     *
     * @param indexDirectoryPath directory that contains the index
     * @param queryForField default field that parsed queries are run against
     * @throws IOException never thrown by the current implementation; kept in the signature
     *         so that existing callers keep compiling
     */
    public IndexedDocumentSearcher(String indexDirectoryPath, String queryForField)
            throws IOException {
        this.indexDirectoryPath = indexDirectoryPath;
        this.queryForField = queryForField;
    }

    /**
     * Parses the query text and returns at most {@code Constants.MAX_SEARCH} hits.
     *
     * @param searchQuery the query text
     * @return the top hits for the query
     * @throws IOException if reading the index fails
     * @throws ParseException if the query text cannot be parsed
     * @throws IllegalStateException if {@link #init()} has not been called yet
     */
    @Override
    public TopDocs search(String searchQuery)
            throws IOException, ParseException {
        ensureInitialized();
        Query query = queryParser.parse(searchQuery);
        return indexSearcher.search(query, Constants.MAX_SEARCH);
    }

    /**
     * Opens the index and creates the query parser. When the searcher was already
     * initialized, the previously opened reader is closed once the new one is in place,
     * so repeated calls do not accumulate open readers.
     *
     * @throws IOException if the index cannot be opened or the previous reader cannot be closed
     */
    @Override
    public void init() throws IOException {
        Directory indexDirectory = FSDirectory.open(Paths.get(indexDirectoryPath));
        IndexReader reader = DirectoryReader.open(indexDirectory);
        IndexSearcher previousSearcher = indexSearcher;
        indexSearcher = new IndexSearcher(reader);
        queryParser = new QueryParser(queryForField, new StandardAnalyzer());
        if (previousSearcher != null) {
            // Swap first, close afterwards: a failure while opening leaves the old state usable.
            previousSearcher.getIndexReader().close();
        }
    }

    /**
     * Loads the document a search hit refers to.
     *
     * @param scoreDoc a hit returned by {@link #search(String)}
     * @return the stored document of the hit
     * @throws CorruptIndexException if the index is found to be corrupt
     * @throws IOException if reading the index fails
     * @throws IllegalStateException if {@link #init()} has not been called yet
     */
    @Override
    public Document getDocument(ScoreDoc scoreDoc)
            throws CorruptIndexException, IOException {
        ensureInitialized();
        return indexSearcher.doc(scoreDoc.doc);
    }

    /** Fails with a descriptive message instead of a bare NullPointerException. */
    private void ensureInitialized() {
        if (indexSearcher == null || queryParser == null) {
            throw new IllegalStateException("init() must be called before using the searcher");
        }
    }
}
\ No newline at end of file
package com.search.lucene.settings;
/**
 * Names of the index fields shared by the indexers and searchers, plus search limits.
 * Declared {@code final} like the other constant holders of this package
 * ({@code IndexPaths}, {@code RepresentationSchema}).
 */
public final class Constants {
    /** Field holding the textual content of a text or PDF file. */
    public static final String CONTENT = "content";
    /** Field holding the concatenated cells of a CSV row. */
    public static final String CSV_ROW = "csv_row";
    /** Field holding the name of the indexed file. */
    public static final String FILE_NAME = "filename";
    /** Field holding the canonical path of the indexed file. */
    public static final String FILE_PATH = "filepath";
    /** Field holding the header name of a CSV cell. */
    public static final String COLUMN = "column";
    /** Field holding the value of a CSV cell. */
    public static final String VALUE = "value";
    /** Field holding the row number of a CSV cell. */
    public static final String ROW = "row";
    /** Maximum number of hits requested from a search. */
    public static final int MAX_SEARCH = 10;
}
package com.search.lucene.settings;
/**
 * Folder names under which the index of each supported file type is stored.
 */
public final class IndexPaths {
    /** Folder of the index built from text files. */
    public static final String TEXT_INDEX_FOLDER_PATH = "textIndex";

    /** Folder of the index built from PDF files. */
    public static final String PDF_INDEX_FOLDER_PATH = "pdfIndex";

    /** Folder of the index built from CSV files. */
    public static final String CSV_INDEX_FOLDER_PATH = "csvIndex";
}
package com.search.lucene.settings;
/**
 * Name of the index field that records a document's kind, together with the values
 * that field takes for each supported file type.
 */
public final class RepresentationSchema {
    /** Name of the field that stores the document kind. */
    public static final String TYPE = "type";

    /** Kind stored for documents built from text files. */
    public static final String TEXT = "TEXT";

    /** Kind stored for documents built from PDF files. */
    public static final String PDF = "PDF";

    /** Kind stored for documents built from CSV cells. */
    public static final String CSV = "CSV";
}
File added
File added
File added
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment