diff --git a/src/structsvm/hamming_costs.py b/src/structsvm/hamming_costs.py
index 4a6a86a..2b26078 100644
--- a/src/structsvm/hamming_costs.py
+++ b/src/structsvm/hamming_costs.py
@@ -27,3 +27,4 @@ def __init__(
 
         self.set_coefficients(coefficients)
         self.set_offset(offset)
+        self.set_scaling_factor(np.sum(mask) if mask is not None else ground_truth.size)
diff --git a/src/structsvm/linear_costs.py b/src/structsvm/linear_costs.py
index a6ef933..7123ec5 100644
--- a/src/structsvm/linear_costs.py
+++ b/src/structsvm/linear_costs.py
@@ -18,3 +18,9 @@ def get_coefficients(self) -> np.ndarray:
 
     def get_offset(self) -> float:
         return self.offset
+
+    def set_scaling_factor(self, scaling_factor: float) -> None:
+        self.scaling_factor = scaling_factor  # soft-margin loss divides costs/features by this
+
+    def get_scaling_factor(self) -> float:
+        return getattr(self, "scaling_factor", 1.0)  # 1.0 (unscaled) if never set
diff --git a/src/structsvm/soft_margin_loss.py b/src/structsvm/soft_margin_loss.py
index 3e0cdec..a96718e 100644
--- a/src/structsvm/soft_margin_loss.py
+++ b/src/structsvm/soft_margin_loss.py
@@ -60,9 +60,14 @@ def __init__(
         self._costs = costs
         self._b = self._costs.get_offset()
         self._g = self._costs.get_coefficients()
+        self._scale = self._costs.get_scaling_factor()
+
+        self._b = self._b / self._scale
+        self._g = self._g / self._scale
+        self._features = self._features / self._scale
 
         # combined features of the ground truth and current y*
-        self._d = features @ ground_truth
+        self._d = self._features @ ground_truth
 
         # setup solver
         self._solver = ilpy.Solver(self._num_variables, ilpy.VariableType.Binary)
@@ -106,7 +111,8 @@ def value_and_gradient(self, w: np.ndarray) -> tuple[float, np.ndarray]:
         solution = self._solver.solve()
 
         # read optimal value L(w)
-        value = solution.get_value()
+        # get_value() excludes the constant term, so add it back
+        value = solution.get_value() + a + self._b
 
         # ∂L(w)/∂w = φ(x')y' - φ(x')y*
         #          = d       - e