Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Sign in
Toggle navigation
A
AutoHealer
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
diaa.hanna
AutoHealer
Commits
59db520e
Commit
59db520e
authored
Dec 26, 2025
by
drnull03
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Cleaning part2 and finalizing
parent
35837c95
Changes
13
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
4415 additions
and
873 deletions
+4415
-873
.gitignore
.gitignore
+1
-1
dependency-reduced-pom.xml
Homework/part2/autohealer/dependency-reduced-pom.xml
+1
-1
pom.xml
Homework/part2/autohealer/pom.xml
+1
-1
Application.java
Homework/part2/autohealer/src/main/java/Application.java
+0
-24
Autohealer.java
Homework/part2/autohealer/src/main/java/Autohealer.java
+69
-78
AutohealerApplication.java
...part2/autohealer/src/main/java/AutohealerApplication.java
+11
-0
logback.xml
Homework/part2/autohealer/src/main/resources/logback.xml
+7
-15
worker.log
Homework/part2/autohealer/worker.log
+4288
-713
launch.sh
Homework/part2/launch.sh
+3
-3
nodeReg.sh
Homework/part2/nodeReg.sh
+7
-0
standalone.cfg
Homework/part2/standalone.cfg
+5
-0
Application.java
...work/part2/transientworker/src/main/java/Application.java
+2
-7
Worker.java
Homework/part2/transientworker/src/main/java/Worker.java
+20
-30
No files found.
.gitignore
View file @
59db520e
**/target/
**/target/
**/*.log
Homework/part2/autohealer/dependency-reduced-pom.xml
View file @
59db520e
...
@@ -26,7 +26,7 @@
...
@@ -26,7 +26,7 @@
<configuration>
<configuration>
<transformers>
<transformers>
<transformer>
<transformer>
<mainClass>
Application
</mainClass>
<mainClass>
A
utohealerA
pplication
</mainClass>
</transformer>
</transformer>
</transformers>
</transformers>
</configuration>
</configuration>
...
...
Homework/part2/autohealer/pom.xml
View file @
59db520e
...
@@ -73,7 +73,7 @@
...
@@ -73,7 +73,7 @@
<configuration>
<configuration>
<transformers>
<transformers>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
<transformer
implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"
>
<mainClass>
Application
</mainClass>
<mainClass>
A
utohealerA
pplication
</mainClass>
</transformer>
</transformer>
</transformers>
</transformers>
</configuration>
</configuration>
...
...
Homework/part2/autohealer/src/main/java/Application.java
deleted
100644 → 0
View file @
35837c95
import
org.apache.zookeeper.KeeperException
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.io.IOException
;
/**
 * Command-line launcher for the autohealer.
 *
 * <p>Expects two arguments: the number of workers to keep alive and the path
 * to the worker jar. With fewer than two arguments it logs a usage message and
 * terminates the JVM with exit status 1.
 */
public class Application {
    private static final Logger logger = LoggerFactory.getLogger(Application.class);

    /**
     * Builds an {@link Autohealer} from the command-line arguments, connects it
     * to ZooKeeper, starts watching the workers and then hands control to
     * {@code run()}.
     *
     * @param args {@code args[0]} is the worker count, {@code args[1]} the worker jar path
     * @throws IOException          propagated from the autohealer calls
     * @throws KeeperException      propagated from the autohealer calls
     * @throws InterruptedException propagated from the autohealer calls
     */
    public static void main(String[] args) throws IOException, KeeperException, InterruptedException {
        final boolean missingArguments = args.length < 2;
        if (missingArguments) {
            logger.error("Usage: java -jar autohealer.jar <number_of_workers> <path_to_worker_jar>");
            System.exit(1);
        }

        // The count is parsed before the path is read, exactly as before.
        Autohealer autohealer = new Autohealer(Integer.parseInt(args[0]), args[1]);

        autohealer.connectToZookeeper();
        autohealer.startWatchingWorkers();
        // The last call blocks the main thread.
        autohealer.run();
    }
}
Homework/part2/autohealer/src/main/java/Autohealer.java
View file @
59db520e
...
@@ -3,117 +3,108 @@ import org.slf4j.Logger;
...
@@ -3,117 +3,108 @@ import org.slf4j.Logger;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
java.io.File
;
import
java.io.File
;
import
java.io.IOException
;
import
java.util.*
;
import
java.util.*
;
public
class
Autohealer
implements
Watcher
{
public
class
Autohealer
implements
Watcher
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Autohealer
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Autohealer
.
class
);
// Update to include all ensemble nodes
private
static
final
String
ZK
=
"127.0.0.1:2181"
;
private
static
final
String
ZOOKEEPER_ADDRESS
=
"127.0.0.1:2181,127.0.0.1:2182,127.0.0.1:2183"
;
private
static
final
int
TIMEOUT
=
3000
;
private
static
final
int
SESSION_TIMEOUT
=
3000
;
private
static
final
String
WORKERS_PATH
=
"/workers"
;
private
final
String
pathToWorkerJar
;
private
static
final
String
NODES
=
"/nodes"
;
private
final
int
numberOfWorkers
;
private
static
final
String
WORKERS
=
"/workers"
;
private
ZooKeeper
zooKeeper
;
// Simulate multiple physical nodes
private
final
int
desiredWorkers
;
private
final
List
<
String
>
physicalNodes
=
Arrays
.
asList
(
"node1"
,
"node2"
,
"node3"
);
private
final
String
workerJar
;
private
ZooKeeper
zk
;
public
Autohealer
(
int
numberOfWorkers
,
String
pathToW
orkerJar
)
{
public
Autohealer
(
int
desiredWorkers
,
String
w
orkerJar
)
{
this
.
numberOfWorkers
=
numberOf
Workers
;
this
.
desiredWorkers
=
desired
Workers
;
this
.
pathToWorkerJar
=
pathToW
orkerJar
;
this
.
workerJar
=
w
orkerJar
;
}
}
public
void
connectToZookeeper
()
throws
IOException
{
public
void
connect
()
throws
Exception
{
this
.
zooKeeper
=
new
ZooKeeper
(
ZOOKEEPER_ADDRESS
,
SESSION_TIMEOUT
,
this
);
zk
=
new
ZooKeeper
(
ZK
,
TIMEOUT
,
this
);
logger
.
info
(
"Connecting to ZooKeeper ensemble: {}"
,
ZOOKEEPER_ADDRESS
);
}
}
public
void
startWatchingWorkers
()
throws
KeeperException
,
InterruptedException
{
public
void
bootstrap
()
throws
Exception
{
// Ensure parent znode exists
ensure
(
NODES
);
if
(
zooKeeper
.
exists
(
WORKERS_PATH
,
false
)
==
null
)
{
ensure
(
WORKERS
);
zooKeeper
.
create
(
WORKERS_PATH
,
new
byte
[]{},
ZooDefs
.
Ids
.
OPEN_ACL_UNSAFE
,
CreateMode
.
PERSISTENT
);
logger
.
info
(
"Created parent znode: {}"
,
WORKERS_PATH
);
}
checkAndLaunchWorkers
();
rebalance
();
zooKeeper
.
getChildren
(
WORKERS_PATH
,
this
);
}
}
p
ublic
void
run
()
throws
Interrupted
Exception
{
p
rivate
void
ensure
(
String
path
)
throws
Exception
{
synchronized
(
zooKeeper
)
{
if
(
zk
.
exists
(
path
,
false
)
==
null
)
{
z
ooKeeper
.
wait
();
// block main thread
z
k
.
create
(
path
,
new
byte
[]{},
ZooDefs
.
Ids
.
OPEN_ACL_UNSAFE
,
CreateMode
.
PERSISTENT
);
}
}
}
}
public
void
close
()
throws
InterruptedException
{
public
void
run
()
throws
InterruptedException
{
zooKeeper
.
close
();
synchronized
(
zk
)
{
logger
.
info
(
"ZooKeeper connection closed"
);
zk
.
wait
();
}
}
}
@Override
@Override
public
void
process
(
WatchedEvent
event
)
{
public
void
process
(
WatchedEvent
e
)
{
if
(
event
.
getType
()
==
Event
.
EventType
.
None
)
{
try
{
if
(
event
.
getState
()
==
Event
.
KeeperState
.
SyncConnected
)
{
rebalance
();
logger
.
info
(
"Successfully connected to ZooKeeper"
);
}
catch
(
Exception
ex
)
{
}
else
{
logger
.
error
(
"Rebalance failed"
,
ex
);
synchronized
(
zooKeeper
)
{
logger
.
warn
(
"Disconnected from ZooKeeper"
);
zooKeeper
.
notifyAll
();
}
}
return
;
}
}
}
if
(
event
.
getType
()
==
Event
.
EventType
.
NodeChildrenChanged
&&
event
.
getPath
().
equals
(
WORKERS_PATH
))
{
private
void
rebalance
()
throws
Exception
{
try
{
List
<
String
>
nodes
=
zk
.
getChildren
(
NODES
,
this
);
logger
.
info
(
"Workers changed, checking cluster health..."
);
List
<
String
>
workers
=
zk
.
getChildren
(
WORKERS
,
this
);
checkAndLaunchWorkers
();
}
catch
(
Exception
e
)
{
if
(
nodes
.
isEmpty
())
return
;
logger
.
error
(
"Error checking/launching workers"
,
e
);
}
Map
<
String
,
List
<
String
>>
byNode
=
new
HashMap
<>();
for
(
String
n
:
nodes
)
byNode
.
put
(
n
,
new
ArrayList
<>());
for
(
String
w
:
workers
)
{
byte
[]
data
=
zk
.
getData
(
WORKERS
+
"/"
+
w
,
false
,
null
);
String
node
=
new
String
(
data
);
byNode
.
computeIfAbsent
(
node
,
k
->
new
ArrayList
<>()).
add
(
w
);
}
}
//
Re-set the watch
//
worker failure → restart on same node
try
{
while
(
workers
.
size
()
<
desiredWorkers
)
{
zooKeeper
.
getChildren
(
WORKERS_PATH
,
this
);
String
node
=
leastLoaded
(
byNode
);
}
catch
(
Exception
e
)
{
startWorker
(
node
);
logger
.
error
(
"Failed to reset watch"
,
e
);
workers
.
add
(
"new"
);
}
}
}
private
void
checkAndLaunchWorkers
()
throws
KeeperException
,
InterruptedException
{
// node failure → redistribute
List
<
String
>
children
=
zooKeeper
.
getChildren
(
WORKERS_PATH
,
false
);
for
(
String
deadNode
:
new
HashSet
<>(
byNode
.
keySet
()))
{
int
currentWorkers
=
children
.
size
();
if
(!
nodes
.
contains
(
deadNode
))
{
int
toLaunch
=
numberOfWorkers
-
currentWorkers
;
for
(
String
w
:
byNode
.
get
(
deadNode
))
{
String
target
=
leastLoaded
(
byNode
);
if
(
toLaunch
>
0
)
{
startWorker
(
target
);
logger
.
info
(
"Need to launch {} new worker(s)"
,
toLaunch
);
for
(
int
i
=
0
;
i
<
toLaunch
;
i
++)
{
try
{
String
node
=
selectNodeForWorker
(
children
);
startNewWorker
(
node
);
}
catch
(
IOException
e
)
{
logger
.
error
(
"Failed to start new worker"
,
e
);
}
}
byNode
.
remove
(
deadNode
);
}
}
}
else
{
logger
.
debug
(
"All workers are running, no action needed"
);
}
}
}
}
// Simple round-robin assignment of worker to a physical node
private
String
leastLoaded
(
Map
<
String
,
List
<
String
>>
map
)
{
private
String
selectNodeForWorker
(
List
<
String
>
currentWorkers
)
{
return
map
.
entrySet
()
int
idx
=
currentWorkers
.
size
()
%
physicalNodes
.
size
();
.
stream
()
return
physicalNodes
.
get
(
idx
);
.
min
(
Comparator
.
comparingInt
(
e
->
e
.
getValue
().
size
()))
.
get
().
getKey
();
}
}
private
void
startNewWorker
(
String
node
)
throws
IOException
{
private
void
startWorker
(
String
node
)
throws
Exception
{
File
file
=
new
File
(
pathToWorkerJar
);
File
jar
=
new
File
(
workerJar
);
String
command
=
String
.
format
(
"ssh %s java -jar %s"
,
node
,
file
.
getAbsolutePath
());
logger
.
info
(
"Launching worker on {}: {}"
,
node
,
command
);
ProcessBuilder
pb
=
new
ProcessBuilder
(
Runtime
.
getRuntime
().
exec
(
command
,
null
,
file
.
getParentFile
());
"java"
,
"-jar"
,
jar
.
getAbsolutePath
()
);
pb
.
environment
().
put
(
"NODE_ID"
,
node
);
pb
.
start
();
logger
.
info
(
"Started worker on {}"
,
node
);
}
}
}
}
Homework/part2/autohealer/src/main/java/AutohealerApplication.java
0 → 100644
View file @
59db520e
/**
 * Command-line entry point for the autohealer.
 *
 * <p>Expects two arguments: the desired number of workers and the path to the
 * worker jar. Invalid invocations print a usage message to standard error and
 * terminate the JVM with exit status 1 instead of failing with an
 * {@code ArrayIndexOutOfBoundsException} or a raw {@code NumberFormatException}.
 */
public class AutohealerApplication {
    private static final String USAGE =
            "Usage: java -jar autohealer.jar <number_of_workers> <path_to_worker_jar>";

    /**
     * Validates the arguments, then creates the {@link Autohealer} and calls
     * {@code connect()}, {@code bootstrap()} and {@code run()} in that order.
     *
     * @param args {@code args[0]} is the desired worker count, {@code args[1]} the worker jar path
     * @throws Exception anything propagated from the autohealer calls
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println(USAGE);
            System.exit(1);
        }

        int desiredWorkers;
        try {
            desiredWorkers = Integer.parseInt(args[0]);
        } catch (NumberFormatException e) {
            System.err.println("Invalid number of workers: " + args[0]);
            System.err.println(USAGE);
            System.exit(1);
            return; // not reached; tells the compiler desiredWorkers is assigned below
        }
        String workerJar = args[1];

        Autohealer healer = new Autohealer(desiredWorkers, workerJar);
        healer.connect();
        healer.bootstrap();
        healer.run();
    }
}
Homework/part2/autohealer/src/main/resources/logback.xml
View file @
59db520e
<configuration>
<configuration>
<property
name=
"NODE"
value=
"${node:-leader}"
/>
<!-- Console logger: minimal -->
<appender
name=
"FILE"
class=
"ch.qos.logback.core.rolling.RollingFileAppender"
>
<appender
name=
"CONSOLE"
class=
"ch.qos.logback.core.ConsoleAppender"
>
<file>
logs/${NODE}.log
</file>
<rollingPolicy
class=
"ch.qos.logback.core.rolling.TimeBasedRollingPolicy"
>
<fileNamePattern>
logs/${NODE}.%d{yyyy-MM-dd}.log
</fileNamePattern>
</rollingPolicy>
<encoder>
<encoder>
<pattern>
%d
{HH:mm:ss} [%thread] %-5level %logger{36}
- %msg%n
</pattern>
<pattern>
%d
%-5level [%thread] %logger
- %msg%n
</pattern>
</encoder>
</encoder>
</appender>
</appender>
<!-- File logger -->
<appender
name=
"FILE"
class=
"ch.qos.logback.core.FileAppender"
>
<file>
worker.log
</file>
<append>
true
</append>
<encoder>
<pattern>
%d{yyyy-MM-dd HH:mm:ss} %-5level %logger{36} - %msg%n
</pattern>
</encoder>
</appender>
<!-- Root logger -->
<root
level=
"INFO"
>
<root
level=
"INFO"
>
<appender-ref
ref=
"CONSOLE"
/>
<appender-ref
ref=
"FILE"
/>
<appender-ref
ref=
"FILE"
/>
</root>
</root>
</configuration>
</configuration>
Homework/part2/autohealer/worker.log
View file @
59db520e
This diff is collapsed.
Click to expand it.
Homework/part2/launch.sh
View file @
59db520e
/opt/zookeeper/bin/zkServer.sh start /opt/zookeeper/conf/zoo1.cfg
/opt/zookeeper/bin/zkServer.sh start /opt/zookeeper/conf/zoo2.cfg
# launch one physical node
/opt/zookeeper/bin/zkServer.sh start
/opt/zookeeper/conf/zoo3
.cfg
/opt/zookeeper/bin/zkServer.sh start
./standalone
.cfg
Homework/part2/nodeReg.sh
0 → 100755
View file @
59db520e
#!/bin/bash
# Registers the simulated nodes in ZooKeeper: creates the parent /nodes znode
# and one child znode per node. The autohealer lists the children of /nodes.
#
# Every command goes through zkCli.sh (the client). zkServer.sh is the server
# control script and has no "create" sub-command, so the parent znode was
# never created and the child creates below failed.
/opt/zookeeper/bin/zkCli.sh create /nodes ""
/opt/zookeeper/bin/zkCli.sh create /nodes/nodeA ""
/opt/zookeeper/bin/zkCli.sh create /nodes/nodeB ""
/opt/zookeeper/bin/zkCli.sh create /nodes/nodeC ""
Homework/part2/standalone.cfg
0 → 100644
View file @
59db520e
# ZooKeeper configuration with no server.N entries; launch.sh passes this file
# to "zkServer.sh start".
tickTime=2000
dataDir=/data/zookeeper1
# Matches the 127.0.0.1:2181 address the autohealer and the worker connect to.
clientPort=2181
# NOTE(review): initLimit/syncLimit presumably only matter for a multi-server
# ensemble - confirm whether they are needed in this single-server setup.
initLimit=5
syncLimit=2
Homework/part2/transientworker/src/main/java/Application.java
View file @
59db520e
...
@@ -2,8 +2,6 @@ import org.apache.zookeeper.KeeperException;
...
@@ -2,8 +2,6 @@ import org.apache.zookeeper.KeeperException;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
import
java.io.IOException
;
public
class
Application
{
public
class
Application
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Application
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Application
.
class
);
...
@@ -14,11 +12,8 @@ public class Application {
...
@@ -14,11 +12,8 @@ public class Application {
Worker
worker
=
new
Worker
();
Worker
worker
=
new
Worker
();
worker
.
connectToZookeeper
();
worker
.
connectToZookeeper
();
worker
.
work
();
worker
.
work
();
}
catch
(
IOException
|
KeeperException
|
InterruptedException
e
)
{
}
catch
(
Exception
e
)
{
logger
.
error
(
"Worker failed with exception"
,
e
);
logger
.
error
(
"Worker crashed"
,
e
);
System
.
exit
(
1
);
}
catch
(
RuntimeException
e
)
{
logger
.
error
(
"Critical failure, shutting down worker"
,
e
);
System
.
exit
(
1
);
System
.
exit
(
1
);
}
}
}
}
...
...
Homework/part2/transientworker/src/main/java/Worker.java
View file @
59db520e
import
org.apache.zookeeper.CreateMode
;
import
org.apache.zookeeper.*
;
import
org.apache.zookeeper.KeeperException
;
import
org.apache.zookeeper.ZooDefs
;
import
org.apache.zookeeper.ZooKeeper
;
import
org.slf4j.Logger
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.slf4j.LoggerFactory
;
...
@@ -12,43 +9,36 @@ import java.util.concurrent.locks.LockSupport;
...
@@ -12,43 +9,36 @@ import java.util.concurrent.locks.LockSupport;
public
class
Worker
{
public
class
Worker
{
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Worker
.
class
);
private
static
final
Logger
logger
=
LoggerFactory
.
getLogger
(
Worker
.
class
);
// Update this if you run a cluster
private
static
final
String
ZK
=
"127.0.0.1:2181"
;
private
static
final
String
ZOOKEEPER_ADDRESS
=
"127.0.0.1:2181"
;
private
static
final
int
TIMEOUT
=
3000
;
private
static
final
int
SESSION_TIMEOUT
=
3000
;
private
static
final
String
WORKERS
=
"/workers"
;
private
static
final
String
AUTOHEALER_ZNODES_PATH
=
"/workers"
;
private
static
final
float
FAIL_RATE
=
0.1f
;
private
static
final
float
CHANCE_TO_FAIL
=
0.1
F
;
private
ZooKeeper
zk
;
private
final
Random
random
=
new
Random
();
private
final
Random
random
=
new
Random
();
private
ZooKeeper
zooKeeper
;
public
void
connectToZookeeper
()
throws
IOException
{
public
void
connectToZookeeper
()
throws
IOException
{
this
.
zooKeeper
=
new
ZooKeeper
(
ZOOKEEPER_ADDRESS
,
SESSION_TIMEOUT
,
event
->
{});
zk
=
new
ZooKeeper
(
ZK
,
TIMEOUT
,
e
->
{});
logger
.
info
(
"Connected to ZooKeeper at {}"
,
ZOOKEEPER_ADDRESS
);
}
}
public
void
work
()
throws
KeeperException
,
InterruptedException
{
public
void
work
()
throws
Exception
{
addChildZnode
();
String
nodeId
=
System
.
getenv
(
"NODE_ID"
);
logger
.
info
(
"Worker node created, starting work loop..."
);
while
(
true
)
{
zk
.
create
(
// Minimal console output, detailed logs go to file
WORKERS
+
"/worker-"
,
logger
.
debug
(
"Working..."
);
nodeId
.
getBytes
(),
ZooDefs
.
Ids
.
OPEN_ACL_UNSAFE
,
CreateMode
.
EPHEMERAL_SEQUENTIAL
);
LockSupport
.
parkNanos
(
100_000_000
);
// ~0.1 second
logger
.
info
(
"Worker running on node {}"
,
nodeId
);
if
(
random
.
nextFloat
()
<
CHANCE_TO_FAIL
)
{
while
(
true
)
{
logger
.
error
(
"Critical error happened, exiting..."
);
LockSupport
.
parkNanos
(
100_000_000
);
throw
new
RuntimeException
(
"Worker simulated failure"
);
if
(
random
.
nextFloat
()
<
FAIL_RATE
)
{
throw
new
RuntimeException
(
"Simulated failure"
);
}
}
}
}
}
}
private
void
addChildZnode
()
throws
KeeperException
,
InterruptedException
{
zooKeeper
.
create
(
AUTOHEALER_ZNODES_PATH
+
"/worker_"
,
new
byte
[]{},
ZooDefs
.
Ids
.
OPEN_ACL_UNSAFE
,
CreateMode
.
EPHEMERAL_SEQUENTIAL
);
logger
.
info
(
"Registered ephemeral znode under {}"
,
AUTOHEALER_ZNODES_PATH
);
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment