struct Node {
    char symbol;           // Symbol held at this node; internal nodes store '\0'
    long long freq;        // Total weight (frequency) of the subtree rooted here
    shared_ptr<Node> left; // Child reached by emitting a '0' bit
    shared_ptr<Node> right; // Child reached by emitting a '1' bit

    // Leaf constructor: a single symbol observed `f` times.
    Node(const char s, const long long f)
        : symbol(s), freq(f), left(nullptr), right(nullptr) {}

    // Internal-node constructor: joins two subtrees; the parent's weight
    // is the sum of its children's weights.
    Node(const shared_ptr<Node> &l, const shared_ptr<Node> &r)
        : symbol('\0'), freq(l->freq + r->freq), left(l), right(r) {}
};
// Node represents a node in the Huffman tree
static class Node implements Comparable<Node> {
    char symbol; // Symbol at this node; internal nodes carry '\0'
    int freq;    // Total frequency of the subtree rooted here
    Node left;   // Child reached via bit '0'
    Node right;  // Child reached via bit '1'

    // Leaf: one symbol together with its observed frequency.
    Node(char symbol, int freq) {
        this.symbol = symbol;
        this.freq = freq;
        this.left = null;
        this.right = null;
    }

    // Internal node: merges two subtrees; weight is the children's sum.
    Node(Node left, Node right) {
        this('\0', left.freq + right.freq); // delegate to the leaf constructor
        this.left = left;
        this.right = right;
    }

    // Min-heap ordering for PriorityQueue: lower frequency sorts first.
    @Override
    public int compareTo(Node other) {
        return Integer.compare(freq, other.freq);
    }
}
Leaves carry a concrete symbol and its raw frequency.
Internal nodes have `symbol == '\0'` and `freq = left->freq + right->freq`.
struct Compare {
    // Orders shared Node handles so that std::priority_queue (a max-heap
    // by default) pops the lowest-frequency node first — i.e. a min-heap.
    bool operator()(const shared_ptr<Node> &a, const shared_ptr<Node> &b) const {
        return b->freq < a->freq;
    }
};
// NOTE(review): this appears to be a duplicated fragment of the Node class
// defined earlier in this file — it redeclares only compareTo and omits the
// `freq` field that the method references, so it cannot compile on its own.
// Confirm and remove this copy, or fold it into the primary Node class.
static class Node implements Comparable<Node> {
    // Compare Nodes
    @Override
    public int compareTo(Node other) {
        return Integer.compare(this.freq, other.freq);
    }
}
shared_ptr<Node> buildHuffmanTree(const string &data) {
// 1) Count character frequencies
unordered_map<char, long long> freq;
for (const auto &c: data) {
freq[c]++;
}
// 2) Initialize a min-heap of one-node trees
priority_queue<shared_ptr<Node>, vector<shared_ptr<Node> >, Compare> pq;
for (const auto &p: freq) {
pq.push(make_shared<Node>(p.first, p.second));
}
// 3) Merge until one tree remains
while (pq.size() > 1) {
// Extract two nodes with smallest frequencies
auto left = pq.top();
pq.pop();
auto right = pq.top();
pq.pop();
// Create a new parent node with combined frequency
pq.push(make_shared<Node>(left, right));
}
// The remaining node is the root of the Huffman tree
return pq.top();
}
/**
 * Builds a Huffman tree from the characters of {@code data}.
 * Returns the tree root, or {@code null} when {@code data} is empty
 * (PriorityQueue.poll() yields null on an empty queue).
 */
public static Node buildHuffmanTree(String data) {
    // 1) Tally how often each character occurs
    Map<Character, Integer> freq = new HashMap<>();
    for (int i = 0; i < data.length(); i++) {
        freq.merge(data.charAt(i), 1, Integer::sum);
    }
    // 2) Seed a min-heap with one leaf per distinct character
    //    (Node's compareTo orders by ascending frequency)
    PriorityQueue<Node> heap = new PriorityQueue<>();
    freq.forEach((ch, count) -> heap.add(new Node(ch, count)));
    // 3) Repeatedly merge the two lightest trees under a fresh parent
    while (heap.size() > 1) {
        Node lighter = heap.poll();
        Node heavier = heap.poll();
        heap.add(new Node(lighter, heavier));
    }
    // The sole survivor is the root of the Huffman tree
    return heap.poll();
}
Step 1 collects frequencies in a hash map.
Step 2 seeds the priority queue with leaf nodes.
Step 3 repeatedly extracts two smallest‐weight nodes, merges them under a new internal node, and reinserts—ultimately yielding a single root.
With the completed tree, a depth-first traversal assigns bit-strings:
// Walks the Huffman tree depth-first, extending `prefix` with '0' for a
// left turn and '1' for a right turn, and records each leaf's codeword
// in `codebook`. Null nodes are tolerated (no-op).
void generateCodes(const shared_ptr<Node> &node,
                   const string &prefix,
                   map<char, string> &codebook) {
    if (!node) {
        return;
    }
    const bool isLeaf = node->symbol != '\0';
    if (!isLeaf) {
        // Internal node: recurse into both children.
        generateCodes(node->left, prefix + "0", codebook);
        generateCodes(node->right, prefix + "1", codebook);
        return;
    }
    // Leaf: store its code. A single-symbol tree yields an empty prefix,
    // so fall back to "0" to keep every codeword non-empty.
    codebook[node->symbol] = prefix.empty() ? "0" : prefix;
}
/**
 * Recursively assigns Huffman codewords by depth-first traversal:
 * '0' is appended when descending left, '1' when descending right.
 * Fix: the parameter was misspelled {@code codeboo} while the body used
 * {@code codebook}, which did not compile.
 *
 * @param node     current position in the tree (null is a no-op)
 * @param prefix   bits accumulated on the path from the root
 * @param codebook receives the symbol-to-codeword mappings
 */
public static void generateCodes(Node node, String prefix, Map<Character, String> codebook) {
    if (node == null) return;
    if (node.symbol != '\0') {
        // Leaf node: record its code (use "0" if the tree holds only one
        // symbol, since then the root is a leaf and prefix is empty)
        codebook.put(node.symbol, prefix.isEmpty() ? "0" : prefix);
    } else {
        // Internal: go left (add '0') and right (add '1')
        generateCodes(node.left, prefix + '0', codebook);
        generateCodes(node.right, prefix + '1', codebook);
    }
}
int main() {
    // Example input string to encode
    const string text = "AAABBBBBAAABBDDCCCCCDDDDBBAAAAAAAAAAACCCCAAAEEEEAAADDDAADAAAABBAADDACCACAAAAEEEBBEEAAAADDDDAA";
    // Build the Huffman tree from the character frequencies of `text`
    const auto tree = buildHuffmanTree(text);
    // Derive each symbol's codeword via a depth-first traversal
    map<char, string> codebook;
    generateCodes(tree, "", codebook);
    // Print the codeword table (map iteration is sorted by symbol)
    cout << "Codewords:\n";
    for (const auto &entry: codebook) {
        cout << "'" << entry.first << "': " << entry.second << "\n";
    }
    return 0;
}
public static void main(String[] args) {
    // Force UTF-8 console output regardless of platform default
    System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8));
    // Example input string to encode
    String text = "AAABBBBBAAABBDDCCCCCDDDDBBAAAAAAAAAAACCCCAAAEEEEAAADDDAADAAAABBAADDACCACAAAAEEEBBEEAAAADDDDAA";
    // Build the Huffman tree from character frequencies
    Node root = buildHuffmanTree(text);
    // Derive the codeword for every symbol in the tree
    Map<Character, String> codes = new HashMap<>();
    generateCodes(root, "", codes);
    // Print the codeword table
    System.out.println("Codewords:");
    codes.forEach((symbol, code) ->
            System.out.println("'" + symbol + "': " + code));
}
We insert each of the $n$ distinct symbols into a min-heap (priority queue) based on its frequency. Building the heap in $n$ insertions costs $O(n \log n)$.
We perform exactly $n−1$ merge steps, and each step removes two nodes and reinserts their parent — three heap operations altogether. Since each such operation is $O(\log n)$, this phase is also $O(n \log n)$.
If the symbol alphabet is bounded (for example ASCII or Unicode blocks), $n$ is effectively constant and the entire algorithm runs in linear time $O(N)$.
In the extreme case where every input character is unique ($n=N$), it becomes $O(N \log N)$.
Thus, Huffman coding scales almost linearly in practice and remains efficient even for large inputs with moderately sized alphabets.