...
Paolo 2007-07-01 Paolo Bonzini <[hidden email]> * iconv/iconvtests.st: Add UTF-7 tests. * iconv/Sets.st: Rewrite FromUTF7. --- orig/packages/iconv/Sets.st +++ mod/packages/iconv/Sets.st @@ -990,9 +995,8 @@ next "Convert to a surrogate pair" ch := ch - 16r10000. left := 32. - value := (((ch bitAnd: 16rFFF) + 16rD800) bitShift: 16) + + value := (((ch bitAnd: 16r3FF) + 16rD800) bitShift: 16) + ((ch bitShift: -10) + 16rDC00). - ^$+ ]. @@ -1154,7 +1158,7 @@ getNext ch == 45 "minus" ifTrue: [ ^$+ ]. "Else switch into base64 mode" - shift := 32. + shift := 16. wch := 0. ]. @@ -1164,7 +1168,7 @@ getNext "Terminate base64 encoding. If accumulated data is nonzero, the input is invalid. Also, partial UTF-16 characters are invalid." - (shift <= 26 or: [ wch > 0 ]) ifTrue: [ + (shift <= 10 or: [ wch > 0 ]) ifTrue: [ shift := 0. InvalidSequenceError signal ]. @@ -1175,43 +1179,34 @@ getNext ^self getNext ]. - "Concatenate the base64 integer value to the accumulator" - shift > 6 ifTrue: [ - shift := shift - 6. - wch := wch + (value bitShift: shift). - shift > 16 ifTrue: [ ^self getNext ]. - - (shift between: 11 and: 16) ifTrue: [ - "Completed an UTF-16 character. When we see a High - Surrogate, we must wait for the following Low Surrogate." - wc1 := wch bitShift: -16. - - (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [ - wch := (wch bitAnd: 65535) bitShift: 16. - shift := shift + 16. - ^Character codePoint: wc1 - ]. - ]. - - (shift between: 5 and: 10) ifTrue: [ + shift <= 6 ifTrue: [ + wc1 := wch + (value bitShift: shift - 6). + wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 10. + shift := shift + 10. + + (wc1 between: 16rDC00 and: 16rDFFF) ifTrue: [ + InvalidSequenceError signal ]. + wc1 >= 16r4000000 ifTrue: [ "After an High Surrogate, verify that the next character is indeed a Low Surrogate" - wc1 := wch bitAnd: 65535. - (wc1 between: 16rDC00 and: 16rDFFF) ifFalse: [ - shift := 0. - InvalidSequenceError signal ] - ]. - ]. + (wc1 between: 16r400DC00 and: 16r7FFDFFF) ifTrue: [ + wc1 := ((wc1 bitAnd: 16r3FF0000) bitShift: -6) + + (wc1 bitAnd: 16r3FF) + 16r10000. + ^Character codePoint: wc1 ]. + InvalidSequenceError signal ]. + (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [ + ^Character codePoint: wc1 ]. + shift = 0 ifTrue: [ + InvalidSequenceError signal ]. + + "Read a High Surrogate." + wch := wch + (wc1 - 16rD400 bitShift: 16). + ^self getNext ]. - "Completed an UTF-16 surrogate pair" - - "35FDC00 = -0xD800 << 10 - 0xDC00 + 0x10000" - wc1 := wch bitShift: -16. - wch := (wch bitAnd: 65535) + (value bitShift: shift - 6). - wc1 := (wc1 bitShift: 10) + wch - 16r35FDC00. - - wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 26. - ^Character codePoint: wc1 + "Concatenate the base64 integer value to the accumulator" + shift := shift - 6. + wch := wch + (value bitShift: shift). + ^self getNext! ! ! --- orig/packages/iconv/iconvtests.st +++ mod/packages/iconv/iconvtests.st @@ -113,4 +113,32 @@ testByteArrayAsUnicodeStringColon str := #[239 191 190]. self assert: (str asUnicodeString: 'UTF-8') first = $<16rFFFE>. str := #[208 184]. - self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>! ! + self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>! + +testFromUTF7 + self assert: ('+-' asUnicodeString: 'UTF-7') first = $+. + self assert: ('+BBg-' asUnicodeString: 'UTF-7') first = $<16r418>. + self assert: ('+BBgEOA-' asUnicodeString: 'UTF-7') second = $<16r438>. + self assert: ('+BBgEOAQZ-' asUnicodeString: 'UTF-7') third = $<16r419>. + self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') first = $<16r10FFFF>. + self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') size = 1. + self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>. + self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') size = 2. + self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>. + self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') size = 3! + +testToUTF7 + | str | + self assert: ((UnicodeString with: $+) asString: 'UTF-7') asString = '+-'. + str := UnicodeString with: $<16r418>. + self assert: (str asString: 'UTF-7') encoding = 'UTF-7'. + self assert: (str asString: 'UTF-7') asString = '+BBg-'. + str := str copyWith: $<16r438>. + self assert: (str asString: 'UTF-7') asString = '+BBgEOA-'. + str := str copyWith: $<16r419>. + self assert: (str asString: 'UTF-7') asString = '+BBgEOAQZ-'. + str := UnicodeString with: $<16r10FFFF>. + self assert: (str asString: 'UTF-7') asString = '+2//f/w-'. +! + + ! _______________________________________________ help-smalltalk mailing list [hidden email] http://lists.gnu.org/mailman/listinfo/help-smalltalk |
Free forum by Nabble | Edit this page |