Unique ID Generation Timeout TT

Hello everyone! I’m currently working on the Challenge #2: Unique ID Generation problem, and I have a question.

I’ve implemented my solution using a single leader node. All other nodes request their unique IDs through this leader. However, when I run the following command:

css

maelstrom test -w unique-ids --bin maelstrom --time-limit 30 \
  --rate 1000 --node-count 3 --availability total --nemesis partition

node n1 frequently times out, causing the test to fail. Sometimes it passes, but other times it fails intermittently. If you have any suggestions on what I might be doing wrong or how to improve my approach, I would really appreciate your help.

Below are excerpts of my code and some logs. Thank you in advance!

package main

import (
	"context"
	"encoding/json"
	"sync/atomic"

	maelstrom "github.com/jepsen-io/maelstrom/demo/go"
)

var ops atomic.Uint64

// maelstrom1/maelstrom test -w unique-ids --bin maelstrom --time-limit 30 --rate 1000 --node-count 3 --availability total --nemesis partition
func MaelstromGenerate(n *maelstrom.Node) {
	n.Handle("generate", func(msg maelstrom.Message) error {
		var body map[string]any
		if err := json.Unmarshal(msg.Body, &body); err != nil {
			return err
		}
		cur := uint64(0)
		for cur == 0 {
			cur = getCounter(n)
		}

		body["type"] = "generate_ok"
		body["id"] = cur

		return n.Reply(msg, body)
	})

	n.Handle("get_counter", func(msg maelstrom.Message) error {
		body := map[string]any{
			"id": getCounter(n),
		}
		n.Reply(msg, body)
		return nil
	})
}

func getCounter(n *maelstrom.Node) uint64 {
	if n.ID() == "n0" {
		return ops.Add(1)
	}

	resp, err := n.SyncRPC(context.Background(), "n0", map[string]any{"type": "get_counter"})
	if err != nil {
		return 0
	}

	var body map[string]any
	if err := json.Unmarshal(resp.Body, &body); err != nil {
		return 0
	}
	return uint64(body["id"].(float64))
}

# n1.log
2025/02/14 20:29:01 Received {c1 n1 {"type":"init","node_id":"n1","node_ids":["n0","n1","n2"],"msg_id":1}}
2025/02/14 20:29:01 Node n1 initialized
2025/02/14 20:29:01 Sent {"src":"n1","dest":"c1","body":{"in_reply_to":1,"type":"init_ok"}}
2025/02/14 20:29:01 Received {c6 n1 {"type":"generate","msg_id":1}}
2025/02/14 20:29:01 Sent {"src":"n1","dest":"n0","body":{"msg_id":1,"type":"get_counter"}}
2025/02/14 20:29:06 Received {c6 n1 {"type":"generate","msg_id":2}}
2025/02/14 20:29:06 Sent {"src":"n1","dest":"n0","body":{"msg_id":2,"type":"get_counter"}}
2025/02/14 20:29:11 Received {c6 n1 {"type":"generate","msg_id":3}}
2025/02/14 20:29:11 Sent {"src":"n1","dest":"n0","body":{"msg_id":3,"type":"get_counter"}}
2025/02/14 20:29:16 Received {c6 n1 {"type":"generate","msg_id":4}}
2025/02/14 20:29:16 Sent {"src":"n1","dest":"n0","body":{"msg_id":4,"type":"get_counter"}}
2025/02/14 20:29:21 Received {c6 n1 {"type":"generate","msg_id":5}} // may be timeout here!!
2025/02/14 20:29:21 Sent {"src":"n1","dest":"n0","body":{"msg_id":5,"type":"get_counter"}}
2025/02/14 20:29:21 Received {n0 n1 {"id":16492,"in_reply_to":5}}
2025/02/14 20:29:21 Sent {"src":"n1","dest":"c6","body":{"id":16492,"in_reply_to":5,"msg_id":5,"type":"generate_ok"}}
2025/02/14 20:29:21 Received {c6 n1 {"type":"generate","msg_id":6}}
2025/02/14 20:29:21 Sent {"src":"n1","dest":"n0","body":{"msg_id":6,"type":"get_counter"}}
2025/02/14 20:29:21 Received {n0 n1 {"id":16496,"in_reply_to":6}}
2025/02/14 20:29:21 Sent {"src":"n1","dest":"c6","body":{"id":16496,"in_reply_to":6,"msg_id":6,"type":"generate_ok"}}
2025/02/14 20:29:21 Received {c6 n1 {"type":"generate","msg_id":7}}
2025/02/14 20:29:21 Sent {"src":"n1","dest":"n0","body":{"msg_id":7,"type":"get_counter"}}
2025/02/14 20:29:21 Received {n0 n1 {"id":16500,"in_reply_to":7}}
2025/02/14 20:29:21 Sent {"src":"n1","dest":"c6","body":{"id":16500,"in_reply_to":7,"msg_id":7,"type":"generate_ok"}}
2025/02/14 20:29:21 Received {c6 n1 {"type":"generate","msg_id":8}}
2025/02/14 20:29:21 Sent {"src":"n1","dest":"n0","body":{"msg_id":8,"type":"get_counter"}}
2025/02/14 20:29:21 Received {n0 n1 {"id":16504,"in_reply_to":8}}
...
#cmd log
jepsen test runner - jepsen.core {:perf {:latency-graph {:valid? true},
        :rate-graph {:valid? true},
        :valid? true},
 :timeline {:valid? true},
 :exceptions {:valid? true},
 :stats {:valid? true,
         :count 24784,
         :ok-count 24780,
         :fail-count 0,
         :info-count 4,
         :by-f {:generate {:valid? true,
                           :count 24784,
                           :ok-count 24780,
                           :fail-count 0,
                           :info-count 4}}},
 :availability {:valid? false, :ok-fraction 0.9998386},
 :net {:all {:send-count 73884,
             :recv-count 73880,
             :msg-count 73884,
             :msgs-per-op 2.9811168},
       :clients {:send-count 49570,
                 :recv-count 49570,
                 :msg-count 49570},
       :servers {:send-count 24314,
                 :recv-count 24310,
                 :msg-count 24314,
                 :msgs-per-op 0.9810361},
       :valid? true},
 :workload {:valid? true,
            :attempted-count 24784,
            :acknowledged-count 24780,
            :duplicated-count 0,
            :duplicated {},
            :range [1 24780]},
 :valid? false}

(Author)
The reason why I failed in the above problem is that the problem assumes Network Partition, so we should have implemented AP system. But I implemented CA system. That’s why the timeout situation occurred.

In this challenge, you’ll need to implement a globally-unique ID generation system that runs against Maelstrom’s unique-ids workload. Your service should be totally available, meaning that it can continue to operate even in the face of network partitions.

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.