From 301c6037bf3aa2246d434eb9daf6f524f18a9170 Mon Sep 17 00:00:00 2001
From: lovit
Date: Wed, 11 Mar 2026 03:04:48 +0900
Subject: [PATCH] =?UTF-8?q?test(hangle):=20ConvolutionHangleEncoder=20?=
 =?UTF-8?q?=EB=8B=A8=EC=9C=84=20=ED=85=8C=EC=8A=A4=ED=8A=B8=20=EC=B6=94?=
 =?UTF-8?q?=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- setup_method로 인스턴스 공유
- encode/decode 라운드트립, shape, binary 검증
- 공백/숫자/한글 자모 분해 인코딩 검증
- 비한글 문자 정규화 처리 검증
- dim=80 상수 검증

Co-Authored-By: Claude Sonnet 4.6
---
 tests/unit/test_hangle.py | 59 +++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/tests/unit/test_hangle.py b/tests/unit/test_hangle.py
index ceb7235f..c3bf70cd 100644
--- a/tests/unit/test_hangle.py
+++ b/tests/unit/test_hangle.py
@@ -176,14 +176,63 @@ def test_empty(self):
 
 
 class TestConvolutionHangleEncoder:
+    def setup_method(self):
+        self.encoder = ConvolutionHangleEncoder()
+
     def test_encode_decode_roundtrip(self):
-        encoder = ConvolutionHangleEncoder()
         text = "한글 테스트"
-        onehot = encoder.sent_to_onehot(text)
-        decoded = encoder.onehot_to_sent(onehot)
+        onehot = self.encoder.sent_to_onehot(text)
+        decoded = self.encoder.onehot_to_sent(onehot)
         assert decoded == text
 
     def test_encode_shape(self):
-        encoder = ConvolutionHangleEncoder()
-        x = encoder.encode("한글")
+        x = self.encoder.encode("한글")
         assert x.shape == (2, 80)
+
+    def test_encode_is_binary(self):
+        # 각 행은 one-hot이므로 0 또는 1만 포함
+        x = self.encoder.encode("안녕")
+        assert set(x.flatten().tolist()).issubset({0.0, 1.0})
+
+    def test_space_encoding(self):
+        # 한글 사이의 공백은 space 인덱스(78)로 인코딩되어야 함
+        onehot = self.encoder.sent_to_onehot("가 나")
+        # "가 나" → ['가', ' ', '나']
+        assert onehot[1] == (self.encoder.space,)
+
+    def test_number_encoding(self):
+        # 숫자 '0'~'9'가 number_begin(68) 이상 인덱스로 인코딩됨
+        for i, digit in enumerate("0123456789"):
+            onehot = self.encoder.sent_to_onehot(digit)
+            assert onehot[0] == (self.encoder.number_begin + i,)
+
+    def test_korean_character_decomposition(self):
+        # '한' → (초성, 중성, 종성) 3-tuple
+        onehot = self.encoder.sent_to_onehot("한")
+        assert len(onehot) == 1
+        assert len(onehot[0]) == 3
+        cho, jung, jong = onehot[0]
+        assert 0 <= cho < self.encoder.jung_begin
+        assert self.encoder.jung_begin <= jung < self.encoder.jong_begin
+        assert self.encoder.jong_begin <= jong < self.encoder.number_begin
+
+    def test_non_korean_normalized(self):
+        # 한글·숫자·공백 이외 문자는 공백으로 치환됨 (normalize 후 strip)
+        onehot = self.encoder.sent_to_onehot("abc")
+        # "abc" → " " (공백으로 치환 후 strip → "")이 되므로 빈 리스트
+        assert onehot == []
+
+    def test_encode_length_matches_chars(self):
+        text = "한글 테스트"
+        x = self.encoder.encode(text)
+        onehot = self.encoder.sent_to_onehot(text)
+        assert x.shape[0] == len(onehot)
+        assert x.shape[1] == self.encoder.dim
+
+    def test_dim_is_80(self):
+        assert self.encoder.dim == 80
+
+    def test_encode_decode_roundtrip_with_space(self):
+        text = "나는 학생"
+        decoded = self.encoder.onehot_to_sent(self.encoder.sent_to_onehot(text))
+        assert decoded == text