@@ -15,6 +15,9 @@ def process_node(self, node):
1515 orig_attr [attr ] = getattr (node , attr )
1616 orig_attr ['feats' ] = node .feats .copy ()
1717 orig_attr ['misc' ] = node .misc .copy ()
18+ # Defaults for the newly created MWT
19+ mwt_misc = node .misc .copy ()
20+ mwt_form = node .form
1821
1922 forms = analysis ['form' ].split ()
2023 main = analysis .get ('main' , 0 )
@@ -37,6 +40,7 @@ def process_node(self, node):
3740 elif orig_attr ['form' ][0 ].isupper ():
3841 nodes [0 ].form = nodes [0 ].form .title ()
3942
43+ node .misc = None
4044 for attr in 'lemma upos xpos feats deprel misc' .split ():
4145 if attr in analysis :
4246 values = analysis [attr ].split ()
@@ -47,6 +51,17 @@ def process_node(self, node):
4751 logging .warning ("%s = %s" % (attr , analysis .get (attr , '' )))
4852 if values [i ] == '*' :
4953 setattr (new_node , attr , orig_attr [attr ])
54+ # No MISC attribute should be duplicated on the word level and token level,
55+ # so if copying MISC to a new_node, delete mwt_misc.
56+ # However, SpaceAfter should be annotated only on the token level,
57+ # so make sure it is not accidentally copied on the word level.
58+ if attr == 'misc' :
59+ orig_attr ['misc' ].clear ()
60+ for a in 'SpaceAfter SpacesAfter SpacesBefore' .split ():
61+ if new_node .misc [a ]:
62+ orig_attr ['misc' ][a ] = new_node .misc [a ]
63+ del new_node .misc [a ]
64+
5065 elif attr == 'feats' and '*' in values [i ]:
5166 new_node .feats = values [i ]
5267 for feat_name , feat_value in list (new_node .feats .items ()):
@@ -55,8 +70,23 @@ def process_node(self, node):
5570 else :
5671 setattr (new_node , attr , values [i ])
5772
58- mwt = node .root .create_multiword_token (nodes , orig_attr ['form' ], orig_attr ['misc' ])
59- node .misc = None
73+ # Entity (coreference) annotation should be only on the word level,
74+ # so make sure it does not stay on the token level.
75+ if mwt_misc ['Entity' ]:
76+ nodes [0 ].misc ['Entity' ] = mwt_misc ['Entity' ]
77+ del mwt_misc ['Entity' ]
78+
79+ # If node is already part of an MWT, we need to delete the old MWT and extend the new MWT.
80+ if node .multiword_token :
81+ mwt_words = node .multiword_token .words
82+ mwt_form = node .multiword_token .form
83+ if node .multiword_token .misc :
84+ mwt_misc .update (node .multiword_token .misc )
85+ node .multiword_token .remove ()
86+ mwt_words [mwt_words .index (node ):mwt_words .index (node )+ 1 ] = nodes
87+ nodes = mwt_words
88+
89+ mwt = node .root .create_multiword_token (nodes , mwt_form , mwt_misc )
6090 self .postprocess_mwt (mwt )
6191
6292 def multiword_analysis (self , node ):
0 commit comments