[PATCH] fix utf7

Previous Topic Next Topic
 
Classic | List | Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[PATCH] fix utf7

Paolo Bonzini
...

Paolo

2007-07-01  Paolo Bonzini  <[hidden email]>

        * iconv/iconvtests.st: Add UTF-7 tests.
        * iconv/Sets.st: Fix the surrogate-pair mask in #next
        (16rFFF -> 16r3FF).  Rewrite FromUTF7.
 


--- orig/packages/iconv/Sets.st
+++ mod/packages/iconv/Sets.st
@@ -990,9 +995,8 @@ next
  "Convert to a surrogate pair"
  ch := ch - 16r10000.
  left := 32.
- value := (((ch bitAnd: 16rFFF) + 16rD800) bitShift: 16) +
+ value := (((ch bitAnd: 16r3FF) + 16rD800) bitShift: 16) +
  ((ch bitShift: -10) + 16rDC00).
-
  ^$+
     ].
 
@@ -1154,7 +1158,7 @@ getNext
  ch == 45 "minus" ifTrue: [ ^$+ ].
 
  "Else switch into base64 mode"
- shift := 32.
+ shift := 16.
  wch := 0.
     ].
 
@@ -1164,7 +1168,7 @@ getNext
     "Terminate base64 encoding.
      If accumulated data is nonzero, the input is invalid.
      Also, partial UTF-16 characters are invalid."
-    (shift <= 26 or: [ wch > 0 ]) ifTrue: [
+    (shift <= 10 or: [ wch > 0 ]) ifTrue: [
  shift := 0.
  InvalidSequenceError signal ].
 
@@ -1175,43 +1179,34 @@ getNext
     ^self getNext
  ].
 
-    "Concatenate the base64 integer value to the accumulator"
-    shift > 6 ifTrue: [
- shift := shift - 6.
- wch := wch + (value bitShift: shift).
- shift > 16 ifTrue: [ ^self getNext ].
-
- (shift between: 11 and: 16) ifTrue: [
-    "Completed an UTF-16 character.  When we see a High
-     Surrogate, we must wait for the following Low Surrogate."
-    wc1 := wch bitShift: -16.
-
-    (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [
- wch := (wch bitAnd: 65535) bitShift: 16.
- shift := shift + 16.
- ^Character codePoint: wc1
-    ].
- ].
-
- (shift between: 5 and: 10) ifTrue: [
+    shift <= 6 ifTrue: [
+        wc1 := wch + (value bitShift: shift - 6).
+        wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 10.
+        shift := shift + 10.
+
+ (wc1 between: 16rDC00 and: 16rDFFF) ifTrue: [
+    InvalidSequenceError signal ].
+ wc1 >= 16r4000000 ifTrue: [
     "After an High Surrogate, verify that the next character
      is indeed a Low Surrogate"
-    wc1 := wch bitAnd: 65535.
-    (wc1 between: 16rDC00 and: 16rDFFF) ifFalse: [
- shift := 0.
- InvalidSequenceError signal ]
- ].
-    ].
+    (wc1 between: 16r400DC00 and: 16r7FFDFFF) ifTrue: [
+        wc1 := ((wc1 bitAnd: 16r3FF0000) bitShift: -6)
+       + (wc1 bitAnd: 16r3FF) + 16r10000.
+        ^Character codePoint: wc1 ].
+    InvalidSequenceError signal ].
+ (wc1 between: 16rD800 and: 16rDBFF) ifFalse: [
+    ^Character codePoint: wc1 ].
+ shift = 0 ifTrue: [
+    InvalidSequenceError signal ].
+
+        "Read a High Surrogate."
+ wch := wch + (wc1 - 16rD400 bitShift: 16).
+ ^self getNext ].
 
-    "Completed an UTF-16 surrogate pair"
-
-    "35FDC00 = -0xD800 << 10 - 0xDC00 + 0x10000"
-    wc1 := wch bitShift: -16.
-    wch := (wch bitAnd: 65535) + (value bitShift: shift - 6).
-    wc1 := (wc1 bitShift: 10) + wch - 16r35FDC00.
-
-    wch := ((value bitShift: shift) bitAnd: 16r3F) bitShift: 26.
-    ^Character codePoint: wc1
+    "Concatenate the base64 integer value to the accumulator"
+    shift := shift - 6.
+    wch := wch + (value bitShift: shift).
+    ^self getNext!
 ! !
 
 


--- orig/packages/iconv/iconvtests.st
+++ mod/packages/iconv/iconvtests.st
@@ -113,4 +113,32 @@ testByteArrayAsUnicodeStringColon
     str := #[239 191 190].
     self assert: (str asUnicodeString: 'UTF-8') first = $<16rFFFE>.
     str := #[208 184].
-    self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>! !
+    self assert: (str asUnicodeString: 'UTF-8') first = $<16r438>!
+
+testFromUTF7
+    self assert: ('+-' asUnicodeString: 'UTF-7') first = $+.
+    self assert: ('+BBg-' asUnicodeString: 'UTF-7') first = $<16r418>.
+    self assert: ('+BBgEOA-' asUnicodeString: 'UTF-7') second = $<16r438>.
+    self assert: ('+BBgEOAQZ-' asUnicodeString: 'UTF-7') third = $<16r419>.
+    self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') first = $<16r10FFFF>.
+    self assert: ('+2//f/w-' asUnicodeString: 'UTF-7') size = 1.
+    self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>.
+    self assert: ('+BDjb/9//-' asUnicodeString: 'UTF-7') size = 2.
+    self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') last = $<16r10FFFF>.
+    self assert: ('+BDgEGNv/3/8-' asUnicodeString: 'UTF-7') size = 3!
+
+testToUTF7
+    | str |
+    self assert: ((UnicodeString with: $+) asString: 'UTF-7') asString = '+-'.
+    str := UnicodeString with: $<16r418>.
+    self assert: (str asString: 'UTF-7') encoding = 'UTF-7'.
+    self assert: (str asString: 'UTF-7') asString = '+BBg-'.
+    str := str copyWith: $<16r438>.
+    self assert: (str asString: 'UTF-7') asString = '+BBgEOA-'.
+    str := str copyWith: $<16r419>.
+    self assert: (str asString: 'UTF-7') asString = '+BBgEOAQZ-'.
+    str := UnicodeString with: $<16r10FFFF>.
+    self assert: (str asString: 'UTF-7') asString = '+2//f/w-'.
+!
+
+ !




_______________________________________________
help-smalltalk mailing list
[hidden email]
http://lists.gnu.org/mailman/listinfo/help-smalltalk