Commit 59db520e authored by drnull03's avatar drnull03

Cleaning part2 and finalizing

parent 35837c95
**/target/ **/target/
**/*.log
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
<configuration> <configuration>
<transformers> <transformers>
<transformer> <transformer>
<mainClass>Application</mainClass> <mainClass>AutohealerApplication</mainClass>
</transformer> </transformer>
</transformers> </transformers>
</configuration> </configuration>
......
...@@ -73,7 +73,7 @@ ...@@ -73,7 +73,7 @@
<configuration> <configuration>
<transformers> <transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>Application</mainClass> <mainClass>AutohealerApplication</mainClass>
</transformer> </transformer>
</transformers> </transformers>
</configuration> </configuration>
......
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/**
 * Command-line entry point for the autohealer.
 *
 * <p>Usage: {@code java -jar autohealer.jar <number_of_workers> <path_to_worker_jar>}
 */
public class Application {
    private static final Logger logger = LoggerFactory.getLogger(Application.class);
    private static final String USAGE =
            "Usage: java -jar autohealer.jar <number_of_workers> <path_to_worker_jar>";

    /**
     * Validates the arguments, then connects the autohealer to ZooKeeper, starts
     * watching the workers and blocks in {@code run()}.
     *
     * @param args {@code args[0]} is the number of workers to keep alive,
     *             {@code args[1]} is the path to the worker jar
     * @throws IOException          if the ZooKeeper connection cannot be created
     * @throws KeeperException      if a ZooKeeper operation fails
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public static void main(String[] args) throws IOException, KeeperException, InterruptedException {
        if (args.length < 2) {
            logger.error(USAGE);
            System.exit(1);
            return;
        }

        int numWorkers;
        try {
            numWorkers = Integer.parseInt(args[0]);
        } catch (NumberFormatException e) {
            // A non-numeric worker count is a usage error, not a crash: report it like one.
            logger.error("Invalid number of workers: '{}'", args[0]);
            logger.error(USAGE);
            System.exit(1);
            return;
        }
        String workerPath = args[1];

        Autohealer autohealer = new Autohealer(numWorkers, workerPath);
        autohealer.connectToZookeeper();
        autohealer.startWatchingWorkers();
        autohealer.run(); // blocks
    }
}
...@@ -3,117 +3,108 @@ import org.slf4j.Logger; ...@@ -3,117 +3,108 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.File; import java.io.File;
import java.io.IOException;
import java.util.*; import java.util.*;
public class Autohealer implements Watcher { public class Autohealer implements Watcher {
private static final Logger logger = LoggerFactory.getLogger(Autohealer.class); private static final Logger logger = LoggerFactory.getLogger(Autohealer.class);
// Update to include all ensemble nodes private static final String ZK = "127.0.0.1:2181";
private static final String ZOOKEEPER_ADDRESS = "127.0.0.1:2181,127.0.0.1:2182,127.0.0.1:2183"; private static final int TIMEOUT = 3000;
private static final int SESSION_TIMEOUT = 3000;
private static final String WORKERS_PATH = "/workers";
private final String pathToWorkerJar; private static final String NODES = "/nodes";
private final int numberOfWorkers; private static final String WORKERS = "/workers";
private ZooKeeper zooKeeper;
// Simulate multiple physical nodes private final int desiredWorkers;
private final List<String> physicalNodes = Arrays.asList("node1", "node2", "node3"); private final String workerJar;
private ZooKeeper zk;
public Autohealer(int numberOfWorkers, String pathToWorkerJar) { public Autohealer(int desiredWorkers, String workerJar) {
this.numberOfWorkers = numberOfWorkers; this.desiredWorkers = desiredWorkers;
this.pathToWorkerJar = pathToWorkerJar; this.workerJar = workerJar;
} }
public void connectToZookeeper() throws IOException { public void connect() throws Exception {
this.zooKeeper = new ZooKeeper(ZOOKEEPER_ADDRESS, SESSION_TIMEOUT, this); zk = new ZooKeeper(ZK, TIMEOUT, this);
logger.info("Connecting to ZooKeeper ensemble: {}", ZOOKEEPER_ADDRESS);
} }
public void startWatchingWorkers() throws KeeperException, InterruptedException { public void bootstrap() throws Exception {
// Ensure parent znode exists ensure(NODES);
if (zooKeeper.exists(WORKERS_PATH, false) == null) { ensure(WORKERS);
zooKeeper.create(WORKERS_PATH, new byte[]{}, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
logger.info("Created parent znode: {}", WORKERS_PATH);
}
checkAndLaunchWorkers(); rebalance();
zooKeeper.getChildren(WORKERS_PATH, this);
} }
public void run() throws InterruptedException { private void ensure(String path) throws Exception {
synchronized (zooKeeper) { if (zk.exists(path, false) == null) {
zooKeeper.wait(); // block main thread zk.create(path, new byte[]{}, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
} }
} }
public void close() throws InterruptedException { public void run() throws InterruptedException {
zooKeeper.close(); synchronized (zk) {
logger.info("ZooKeeper connection closed"); zk.wait();
}
@Override
public void process(WatchedEvent event) {
if (event.getType() == Event.EventType.None) {
if (event.getState() == Event.KeeperState.SyncConnected) {
logger.info("Successfully connected to ZooKeeper");
} else {
synchronized (zooKeeper) {
logger.warn("Disconnected from ZooKeeper");
zooKeeper.notifyAll();
}
} }
return;
} }
if (event.getType() == Event.EventType.NodeChildrenChanged && event.getPath().equals(WORKERS_PATH)) { @Override
public void process(WatchedEvent e) {
try { try {
logger.info("Workers changed, checking cluster health..."); rebalance();
checkAndLaunchWorkers(); } catch (Exception ex) {
} catch (Exception e) { logger.error("Rebalance failed", ex);
logger.error("Error checking/launching workers", e);
} }
} }
// Re-set the watch private void rebalance() throws Exception {
try { List<String> nodes = zk.getChildren(NODES, this);
zooKeeper.getChildren(WORKERS_PATH, this); List<String> workers = zk.getChildren(WORKERS, this);
} catch (Exception e) {
logger.error("Failed to reset watch", e); if (nodes.isEmpty()) return;
}
Map<String, List<String>> byNode = new HashMap<>();
for (String n : nodes) byNode.put(n, new ArrayList<>());
for (String w : workers) {
byte[] data = zk.getData(WORKERS + "/" + w, false, null);
String node = new String(data);
byNode.computeIfAbsent(node, k -> new ArrayList<>()).add(w);
} }
private void checkAndLaunchWorkers() throws KeeperException, InterruptedException { // worker failure → restart on same node
List<String> children = zooKeeper.getChildren(WORKERS_PATH, false); while (workers.size() < desiredWorkers) {
int currentWorkers = children.size(); String node = leastLoaded(byNode);
int toLaunch = numberOfWorkers - currentWorkers; startWorker(node);
workers.add("new");
}
if (toLaunch > 0) { // node failure → redistribute
logger.info("Need to launch {} new worker(s)", toLaunch); for (String deadNode : new HashSet<>(byNode.keySet())) {
for (int i = 0; i < toLaunch; i++) { if (!nodes.contains(deadNode)) {
try { for (String w : byNode.get(deadNode)) {
String node = selectNodeForWorker(children); String target = leastLoaded(byNode);
startNewWorker(node); startWorker(target);
} catch (IOException e) {
logger.error("Failed to start new worker", e);
} }
byNode.remove(deadNode);
} }
} else {
logger.debug("All workers are running, no action needed");
} }
} }
// Simple round-robin assignment of worker to a physical node private String leastLoaded(Map<String, List<String>> map) {
private String selectNodeForWorker(List<String> currentWorkers) { return map.entrySet()
int idx = currentWorkers.size() % physicalNodes.size(); .stream()
return physicalNodes.get(idx); .min(Comparator.comparingInt(e -> e.getValue().size()))
.get().getKey();
} }
private void startNewWorker(String node) throws IOException { private void startWorker(String node) throws Exception {
File file = new File(pathToWorkerJar); File jar = new File(workerJar);
String command = String.format("ssh %s java -jar %s", node, file.getAbsolutePath());
logger.info("Launching worker on {}: {}", node, command); ProcessBuilder pb = new ProcessBuilder(
Runtime.getRuntime().exec(command, null, file.getParentFile()); "java", "-jar", jar.getAbsolutePath()
);
pb.environment().put("NODE_ID", node);
pb.start();
logger.info("Started worker on {}", node);
} }
} }
/**
 * Command-line entry point for the autohealer.
 *
 * <p>Usage: {@code java -jar autohealer.jar <number_of_workers> <path_to_worker_jar>}
 */
public class AutohealerApplication {
    private static final String USAGE =
            "Usage: java -jar autohealer.jar <number_of_workers> <path_to_worker_jar>";

    /**
     * Validates the arguments, then connects, bootstraps and runs an {@link Autohealer}.
     *
     * <p>Exits with status 1 and prints the usage line when fewer than two arguments
     * are given or when the worker count is not an integer.
     *
     * @param args {@code args[0]} is the desired number of workers,
     *             {@code args[1]} is the path to the worker jar
     * @throws Exception if connecting to ZooKeeper, bootstrapping or running fails
     */
    public static void main(String[] args) throws Exception {
        // Without this guard a missing argument surfaces as ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println(USAGE);
            System.exit(1);
            return;
        }

        int desiredWorkers;
        try {
            desiredWorkers = Integer.parseInt(args[0]);
        } catch (NumberFormatException e) {
            System.err.println("Invalid number of workers: '" + args[0] + "'");
            System.err.println(USAGE);
            System.exit(1);
            return;
        }
        String workerJar = args[1];

        Autohealer healer = new Autohealer(desiredWorkers, workerJar);
        healer.connect();
        healer.bootstrap();
        healer.run(); // blocks while the autohealer waits on its ZooKeeper handle
    }
}
<configuration> <configuration>
<property name="NODE" value="${node:-leader}"/>
<!-- Console logger: minimal --> <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> <file>logs/${NODE}.log</file>
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>logs/${NODE}.%d{yyyy-MM-dd}.log</fileNamePattern>
</rollingPolicy>
<encoder> <encoder>
<pattern>%d{HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern> <pattern>%d %-5level [%thread] %logger - %msg%n</pattern>
</encoder> </encoder>
</appender> </appender>
<!-- File logger -->
<appender name="FILE" class="ch.qos.logback.core.FileAppender">
<file>worker.log</file>
<append>true</append>
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss} %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- Root logger -->
<root level="INFO"> <root level="INFO">
<appender-ref ref="CONSOLE"/>
<appender-ref ref="FILE"/> <appender-ref ref="FILE"/>
</root> </root>
</configuration> </configuration>
This source diff could not be displayed because it is too large. You can view the blob instead.
/opt/zookeeper/bin/zkServer.sh start /opt/zookeeper/conf/zoo1.cfg
/opt/zookeeper/bin/zkServer.sh start /opt/zookeeper/conf/zoo2.cfg # launch one physical node
/opt/zookeeper/bin/zkServer.sh start /opt/zookeeper/conf/zoo3.cfg /opt/zookeeper/bin/zkServer.sh start ./standalone.cfg
#!/bin/bash
# Registers the node entries (nodeA, nodeB, nodeC) under /nodes in ZooKeeper.
# Znodes are created through the CLI client (zkCli.sh); zkServer.sh only manages
# the server process and has no "create" command, so the /nodes parent must be
# created with zkCli.sh before its children can be added.
/opt/zookeeper/bin/zkCli.sh create /nodes ""
/opt/zookeeper/bin/zkCli.sh create /nodes/nodeA ""
/opt/zookeeper/bin/zkCli.sh create /nodes/nodeB ""
/opt/zookeeper/bin/zkCli.sh create /nodes/nodeC ""
# Length of a single tick in milliseconds; ZooKeeper expresses heartbeats and
# timeouts as multiples of this value.
tickTime=2000
# Directory where the server stores its data.
dataDir=/data/zookeeper1
# Port the server listens on for client connections.
clientPort=2181
# Quorum timing limits, expressed in ticks.
# NOTE(review): these apply to a replicated ensemble; confirm whether they are
# still needed if this file is used for a single standalone server.
initLimit=5
syncLimit=2
...@@ -2,8 +2,6 @@ import org.apache.zookeeper.KeeperException; ...@@ -2,8 +2,6 @@ import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException;
public class Application { public class Application {
private static final Logger logger = LoggerFactory.getLogger(Application.class); private static final Logger logger = LoggerFactory.getLogger(Application.class);
...@@ -14,11 +12,8 @@ public class Application { ...@@ -14,11 +12,8 @@ public class Application {
Worker worker = new Worker(); Worker worker = new Worker();
worker.connectToZookeeper(); worker.connectToZookeeper();
worker.work(); worker.work();
} catch (IOException | KeeperException | InterruptedException e) { } catch (Exception e) {
logger.error("Worker failed with exception", e); logger.error("Worker crashed", e);
System.exit(1);
} catch (RuntimeException e) {
logger.error("Critical failure, shutting down worker", e);
System.exit(1); System.exit(1);
} }
} }
......
import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.*;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -12,43 +9,36 @@ import java.util.concurrent.locks.LockSupport; ...@@ -12,43 +9,36 @@ import java.util.concurrent.locks.LockSupport;
public class Worker { public class Worker {
private static final Logger logger = LoggerFactory.getLogger(Worker.class); private static final Logger logger = LoggerFactory.getLogger(Worker.class);
// Update this if you run a cluster private static final String ZK = "127.0.0.1:2181";
private static final String ZOOKEEPER_ADDRESS = "127.0.0.1:2181"; private static final int TIMEOUT = 3000;
private static final int SESSION_TIMEOUT = 3000; private static final String WORKERS = "/workers";
private static final String AUTOHEALER_ZNODES_PATH = "/workers"; private static final float FAIL_RATE = 0.1f;
private static final float CHANCE_TO_FAIL = 0.1F;
private ZooKeeper zk;
private final Random random = new Random(); private final Random random = new Random();
private ZooKeeper zooKeeper;
public void connectToZookeeper() throws IOException { public void connectToZookeeper() throws IOException {
this.zooKeeper = new ZooKeeper(ZOOKEEPER_ADDRESS, SESSION_TIMEOUT, event -> {}); zk = new ZooKeeper(ZK, TIMEOUT, e -> {});
logger.info("Connected to ZooKeeper at {}", ZOOKEEPER_ADDRESS);
} }
public void work() throws KeeperException, InterruptedException { public void work() throws Exception {
addChildZnode(); String nodeId = System.getenv("NODE_ID");
logger.info("Worker node created, starting work loop...");
while (true) { zk.create(
// Minimal console output, detailed logs go to file WORKERS + "/worker-",
logger.debug("Working..."); nodeId.getBytes(),
ZooDefs.Ids.OPEN_ACL_UNSAFE,
CreateMode.EPHEMERAL_SEQUENTIAL
);
LockSupport.parkNanos(100_000_000); // ~0.1 second logger.info("Worker running on node {}", nodeId);
if (random.nextFloat() < CHANCE_TO_FAIL) { while (true) {
logger.error("Critical error happened, exiting..."); LockSupport.parkNanos(100_000_000);
throw new RuntimeException("Worker simulated failure"); if (random.nextFloat() < FAIL_RATE) {
} throw new RuntimeException("Simulated failure");
} }
} }
private void addChildZnode() throws KeeperException, InterruptedException {
zooKeeper.create(AUTOHEALER_ZNODES_PATH + "/worker_",
new byte[]{},
ZooDefs.Ids.OPEN_ACL_UNSAFE,
CreateMode.EPHEMERAL_SEQUENTIAL);
logger.info("Registered ephemeral znode under {}", AUTOHEALER_ZNODES_PATH);
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment