2 * Copyright (C) 2005, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov (ap@nypop.com)
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
15 * its contributors may be used to endorse or promote products derived
16 * from this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 #import "WebNSURLExtras.h"
32 #import "WebKitNSStringExtras.h"
33 #import "WebLocalizableStrings.h"
34 #import "WebNSDataExtras.h"
35 #import "WebNSObjectExtras.h"
36 #import "WebSystemInterface.h"
37 #import <Foundation/NSURLRequest.h>
38 #import <WebCore/KURL.h>
39 #import <WebCore/LoaderNSURLExtras.h>
40 #import <WebKitSystemInterface.h>
41 #import <wtf/Assertions.h>
42 #import <unicode/uchar.h>
43 #import <unicode/uidna.h>
44 #import <unicode/uscript.h>
46 using namespace WebCore;
// Callback signature used by the host-name scanners in this file: called with the string being
// scanned, the range of one host name inside it, and the caller-supplied context pointer.
49 typedef void (* StringRangeApplierFunction)(NSString *string, NSRange range, void *context);
51 // Needs to be big enough to hold an IDN-encoded name.
52 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
53 #define HOST_NAME_BUFFER_LENGTH 2048
// Size of the first buffer handed to CFURLGetBytes; when that call reports -1 the callers in
// this file ask CFURLGetBytes for the required size and use a heap buffer instead.
55 #define URL_BYTES_BUFFER_LENGTH 2048
// One-time initialization guard for the IDN script white list, and the list itself stored as a
// bit set with one bit per ICU script code (32 script codes per array element).
57 static pthread_once_t IDNScriptWhiteListFileRead = PTHREAD_ONCE_INIT;
58 static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];
// Classifies a Unicode code point as an unsafe "lookalike" character for host-name display.
// Two tests are visible: an ICU property test (not printable, Unicode white space, or
// default-ignorable) and a fixed list of code points taken from Mozilla's IDN blacklist.
// NOTE(review): presumably returns YES when either test matches and NO otherwise — confirm.
60 static inline BOOL isLookalikeCharacter(int charCode)
62 // FIXME: Move this code down into WebCore so it can be shared with other platforms.
64 // This function treats the following as unsafe, lookalike characters:
65 // any non-printable character, any character considered as whitespace that isn't already converted to a space by ICU,
66 // and any ignorable character.
68 // We also considered the characters in Mozilla's blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars),
69 // and included all of these characters that ICU can encode.
71 if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
// Explicitly listed code points that resemble URL syntax characters or common Latin letters.
75 case 0x00ED: /* LATIN SMALL LETTER I WITH ACUTE */
76 case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */
77 case 0x0251: /* LATIN SMALL LETTER ALPHA */
78 case 0x0261: /* LATIN SMALL LETTER SCRIPT G */
79 case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */
80 case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */
81 case 0x05B4: /* HEBREW POINT HIRIQ */
82 case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */
83 case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */
84 case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */
85 case 0x0660: /* ARABIC INDIC DIGIT ZERO */
86 case 0x06D4: /* ARABIC FULL STOP */
87 case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */
88 case 0x2027: /* HYPHENATION POINT */
89 case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
90 case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
91 case 0x2044: /* FRACTION SLASH */
92 case 0x2215: /* DIVISION SLASH */
93 case 0x2216: /* SET MINUS */
94 case 0x233F: /* APL FUNCTIONAL SYMBOL SLASH BAR */
95 case 0x23AE: /* INTEGRAL EXTENSION */
96 case 0x244A: /* OCR DOUBLE BACKSLASH */
97 case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */
98 case 0x2572: /* BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT */
99 case 0x29F8: /* BIG SOLIDUS */
100 case 0x29f6: /* SOLIDUS WITH OVERBAR */
101 case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */
102 case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */
103 case 0x3008: /* LEFT ANGLE BRACKET */
104 case 0x3014: /* LEFT TORTOISE SHELL BRACKET */
105 case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */
106 case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */
107 case 0x3035: /* VERTICAL KANA REPEAT MARK LOWER HALF */
108 case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */
109 case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */
110 case 0x33DF: /* SQUARE A OVER M */
111 case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
112 case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
113 case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */
114 case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */
115 case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */
// Converts a nibble value (0-15) to its uppercase hexadecimal ASCII digit ('0'-'9', 'A'-'F').
// Any other value is logged with LOG_ERROR and mapped to '0'.
static char hexDigit(int i)
{
    // Valid nibbles are 0..15. The value 16 must be rejected too, otherwise it would be
    // converted to 'G', which is not a hexadecimal digit.
    if (i < 0 || i > 15) {
        LOG_ERROR("illegal hex digit");
        return '0';
    }
    if (i >= 10)
        return static_cast<char>(i - 10 + 'A');
    return static_cast<char>(i + '0');
}
// Reports whether c is an ASCII hexadecimal digit: '0'-'9', 'A'-'F' or 'a'-'f'.
138 static BOOL isHexDigit(char c)
140 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
// Maps an ASCII hexadecimal digit character to an int. Separate branches handle '0'-'9',
// 'A'-'F' and 'a'-'f'; any other character is reported through LOG_ERROR.
// NOTE(review): presumably returns 0-15 for valid digits — confirm the per-branch results.
143 static int hexDigitValue(char c)
145 if (c >= '0' && c <= '9') {
148 if (c >= 'A' && c <= 'F') {
151 if (c >= 'a' && c <= 'f') {
154 LOG_ERROR("illegal hex digit");
// Scans a mailto: URL string and calls f(string, range, context) for each host-name range it
// finds. A host name starts after an '@' and runs up to the next '>', ',' or '?' (or to the end
// of the string when none is found). Double-quoted strings, including backslash-escaped
// characters inside them, are skipped so their contents are not mistaken for host names.
158 static void applyHostNameFunctionToMailToURLString(NSString *string, StringRangeApplierFunction f, void *context)
160 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
161 // Skip quoted strings so that characters in them don't confuse us.
162 // When we find a '?' character, we are past the part of the URL that contains host names.
// The character sets below are created on first use and kept alive with CFRetain for later calls.
164 static NSCharacterSet *hostNameOrStringStartCharacters;
165 if (hostNameOrStringStartCharacters == nil) {
166 hostNameOrStringStartCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"@?"];
167 CFRetain(hostNameOrStringStartCharacters);
169 static NSCharacterSet *hostNameEndCharacters;
170 if (hostNameEndCharacters == nil) {
171 hostNameEndCharacters = [NSCharacterSet characterSetWithCharactersInString:@">,?"];
172 CFRetain(hostNameEndCharacters);
174 static NSCharacterSet *quotedStringCharacters;
175 if (quotedStringCharacters == nil) {
176 quotedStringCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"\\"];
177 CFRetain(quotedStringCharacters);
// "remaining" always describes the not-yet-scanned tail of the string.
180 unsigned stringLength = [string length];
181 NSRange remaining = NSMakeRange(0, stringLength);
184 // Find start of host name or of quoted string.
185 NSRange hostNameOrStringStart = [string rangeOfCharacterFromSet:hostNameOrStringStartCharacters options:0 range:remaining];
186 if (hostNameOrStringStart.location == NSNotFound) {
189 unichar c = [string characterAtIndex:hostNameOrStringStart.location];
190 remaining.location = NSMaxRange(hostNameOrStringStart);
191 remaining.length = stringLength - remaining.location;
198 // Find end of host name.
199 unsigned hostNameStart = remaining.location;
200 NSRange hostNameEnd = [string rangeOfCharacterFromSet:hostNameEndCharacters options:0 range:remaining];
// No terminator found: the host name extends to the end of the string.
202 if (hostNameEnd.location == NSNotFound) {
203 hostNameEnd.location = stringLength;
206 remaining.location = hostNameEnd.location;
207 remaining.length = stringLength - remaining.location;
211 // Process host name range.
212 f(string, NSMakeRange(hostNameStart, hostNameEnd.location - hostNameStart), context);
218 // Skip quoted string.
221 NSRange escapedCharacterOrStringEnd = [string rangeOfCharacterFromSet:quotedStringCharacters options:0 range:remaining];
222 if (escapedCharacterOrStringEnd.location == NSNotFound) {
225 c = [string characterAtIndex:escapedCharacterOrStringEnd.location];
226 remaining.location = NSMaxRange(escapedCharacterOrStringEnd);
227 remaining.length = stringLength - remaining.location;
229 // If we are at the end of the string, then break from the string loop back to the host name loop.
234 // Skip escaped character.
236 if (remaining.length == 0) {
239 remaining.location += 1;
240 remaining.length -= 1;
// Locates host names in a URL string and reports each one to f(string, range, context).
// Strings with a case-insensitive "mailto:" prefix are handed to
// applyHostNameFunctionToMailToURLString. For other strings the host of a hierarchical URL
// ("scheme://[userinfo@]host...") is located and reported with one call to f.
246 static void applyHostNameFunctionToURLString(NSString *string, StringRangeApplierFunction f, void *context)
248 // Find hostnames. Too bad we can't use any real URL-parsing code to do this,
249 // but we have to do it before doing all the %-escaping, and this is the only
250 // code we have that parses mailto URLs anyway.
252 // Maybe we should implement this using a character buffer instead?
254 if ([string _webkit_hasCaseInsensitivePrefix:@"mailto:"]) {
255 applyHostNameFunctionToMailToURLString(string, f, context);
259 // Find the host name in a hierarchical URL.
260 // It comes after a "://" sequence, with scheme characters preceding.
261 // It ends with the end of the string or a ":", "/", "?", or "#".
262 // If there is a "@" character, the host part is just the part after the "@".
263 NSRange separatorRange = [string rangeOfString:@"://"];
264 if (separatorRange.location == NSNotFound) {
268 // Check that all characters before the :// are valid scheme characters.
269 static NSCharacterSet *nonSchemeCharacters;
270 if (nonSchemeCharacters == nil) {
271 nonSchemeCharacters = [[NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-."] invertedSet];
272 CFRetain(nonSchemeCharacters);
274 if ([string rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:NSMakeRange(0, separatorRange.location)].location != NSNotFound) {
278 unsigned stringLength = [string length];
280 static NSCharacterSet *hostTerminators;
281 if (hostTerminators == nil) {
282 hostTerminators = [NSCharacterSet characterSetWithCharactersInString:@":/?#"];
283 CFRetain(hostTerminators);
286 // Start after the separator.
287 unsigned authorityStart = NSMaxRange(separatorRange);
289 // Find terminating character.
290 NSRange hostNameTerminator = [string rangeOfCharacterFromSet:hostTerminators options:0 range:NSMakeRange(authorityStart, stringLength - authorityStart)];
291 unsigned hostNameEnd = hostNameTerminator.location == NSNotFound ? stringLength : hostNameTerminator.location;
293 // Find "@" for the start of the host name.
294 NSRange userInfoTerminator = [string rangeOfString:@"@" options:0 range:NSMakeRange(authorityStart, hostNameEnd - authorityStart)];
295 unsigned hostNameStart = userInfoTerminator.location == NSNotFound ? authorityStart : NSMaxRange(userInfoTerminator);
297 f(string, NSMakeRange(hostNameStart, hostNameEnd - hostNameStart), context);
300 @implementation NSURL (WebNSURLExtras)
// Shared body of the two collector callbacks. Asks the string whether the host name in range
// needs encoding (encode == YES) or decoding (encode == NO). context is treated as an
// NSMutableArray **: the array is created here with alloc/init and the range is appended to it
// boxed in an NSValue.
// NOTE(review): presumably nothing is appended when no mapping is needed, and the array is only
// allocated while *array is still nil — confirm.
302 static void collectRangesThatNeedMapping(NSString *string, NSRange range, void *context, BOOL encode)
304 BOOL needsMapping = encode
305 ? [string _web_hostNameNeedsEncodingWithRange:range]
306 : [string _web_hostNameNeedsDecodingWithRange:range];
311 NSMutableArray **array = (NSMutableArray **)context;
313 *array = [[NSMutableArray alloc] init];
316 [*array addObject:[NSValue valueWithRange:range]];
// StringRangeApplierFunction adapter: collects host-name ranges that need encoding.
319 static void collectRangesThatNeedEncoding(NSString *string, NSRange range, void *context)
321 return collectRangesThatNeedMapping(string, range, context, YES);
// StringRangeApplierFunction adapter: collects host-name ranges that need decoding.
324 static void collectRangesThatNeedDecoding(NSString *string, NSRange range, void *context)
326 return collectRangesThatNeedMapping(string, range, context, NO);
// Returns a version of string in which each host name that needs it has been encoded
// (encode == YES) or decoded (encode == NO). Ranges needing mapping are collected first; the
// replacements are then applied to a mutable copy, which is returned autoreleased.
// NOTE(review): the early exits (pure-ASCII input when encoding, no ranges collected) presumably
// return string unchanged — confirm.
329 static NSString *mapHostNames(NSString *string, BOOL encode)
331 // Generally, we want to optimize for the case where there is one host name that does not need mapping.
333 if (encode && [string canBeConvertedToEncoding:NSASCIIStringEncoding])
336 // Make a list of ranges that actually need mapping.
337 NSMutableArray *hostNameRanges = nil;
338 StringRangeApplierFunction f = encode
339 ? collectRangesThatNeedEncoding
340 : collectRangesThatNeedDecoding;
341 applyHostNameFunctionToURLString(string, f, &hostNameRanges);
342 if (hostNameRanges == nil)
// Mapped names are computed from the original string but written into the copy.
// NOTE(review): i starts at the range count, so the ranges appear to be processed last-to-first,
// which keeps earlier ranges valid after a replacement changes the length — confirm.
346 NSMutableString *mutableCopy = [string mutableCopy];
347 unsigned i = [hostNameRanges count];
349 NSRange hostNameRange = [[hostNameRanges objectAtIndex:i] rangeValue];
350 NSString *mappedHostName = encode
351 ? [string _web_encodeHostNameWithRange:hostNameRange]
352 : [string _web_decodeHostNameWithRange:hostNameRange];
353 [mutableCopy replaceCharactersInRange:hostNameRange withString:mappedHostName];
// Balances the alloc/init performed by the collector callback.
355 [hostNameRanges release];
356 return [mutableCopy autorelease];
// Builds an NSURL from text typed by a user. The string is trimmed of white space, its host
// names are encoded via mapHostNames, and it is converted to UTF-8. Every byte <= 0x20 or
// >= 0x7f is then %-escaped, and the resulting bytes are passed to
// +_web_URLWithData:relativeToURL: together with URL.
359 + (NSURL *)_web_URLWithUserTypedString:(NSString *)string relativeToURL:(NSURL *)URL
364 string = mapHostNames([string _webkit_stringByTrimmingWhitespace], YES);
366 NSData *userTypedData = [string dataUsingEncoding:NSUTF8StringEncoding];
367 ASSERT(userTypedData);
369 const UInt8 *inBytes = static_cast<const UInt8 *>([userTypedData bytes]);
370 int inLength = [userTypedData length];
// NOTE(review): presumably the empty-URL result below is returned only for zero-length input — confirm.
372 return [NSURL URLWithString:@""];
375 char *outBytes = static_cast<char *>(malloc(inLength * 3)); // large enough to %-escape every character
379 for (i = 0; i < inLength; i++) {
380 UInt8 c = inBytes[i];
// Control characters, space, DEL and all non-ASCII bytes are written as escape sequences.
381 if (c <= 0x20 || c >= 0x7f) {
383 *p++ = hexDigit(c >> 4);
384 *p++ = hexDigit(c & 0xf);
393 NSData *data = [NSData dataWithBytesNoCopy:outBytes length:outLength]; // adopts outBytes
394 return [self _web_URLWithData:data relativeToURL:URL];
// Convenience form of +_web_URLWithUserTypedString:relativeToURL: with no base URL.
397 + (NSURL *)_web_URLWithUserTypedString:(NSString *)string
399 return [self _web_URLWithUserTypedString:string relativeToURL:nil];
// Convenience form of +_web_URLWithDataAsString:relativeToURL: with no base URL.
402 + (NSURL *)_web_URLWithDataAsString:(NSString *)string
407 return [self _web_URLWithDataAsString:string relativeToURL:nil];
// Creates a URL from a string that carries raw URL bytes as characters: the string is trimmed of
// white space, encoded as ISO Latin-1, and the bytes are passed to +_web_URLWithData:relativeToURL:.
410 + (NSURL *)_web_URLWithDataAsString:(NSString *)string relativeToURL:(NSURL *)baseURL
415 string = [string _webkit_stringByTrimmingWhitespace];
416 NSData *data = [string dataUsingEncoding:NSISOLatin1StringEncoding];
417 return [self _web_URLWithData:data relativeToURL:baseURL];
// Convenience form of +_web_URLWithData:relativeToURL: with no base URL.
420 + (NSURL *)_web_URLWithData:(NSData *)data
422 return [NSURL _web_URLWithData:data relativeToURL:nil];
// Creates an absolute URL from raw bytes, resolved against baseURL after the base URL's resource
// specifier has been removed. The bytes are interpreted as UTF-8 first, then as ISO Latin-1,
// and an empty-string URL is the last fallback.
// NOTE(review): presumably each fallback runs only when the previous attempt produced nil, and
// the empty-URL branch handles zero-length data — confirm.
425 + (NSURL *)_web_URLWithData:(NSData *)data relativeToURL:(NSURL *)baseURL
431 size_t length = [data length];
433 // work around <rdar://4470771>: CFURLCreateAbsoluteURLWithBytes(.., TRUE) doesn't remove non-path components.
434 baseURL = [baseURL _webkit_URLByRemovingResourceSpecifier];
436 const UInt8 *bytes = static_cast<const UInt8*>([data bytes]);
437 // NOTE: We use UTF-8 here since this encoding is used when computing strings when returning URL components
438 // (e.g calls to NSURL -path). However, this function is not tolerant of illegal UTF-8 sequences, which
439 // could either be a malformed string or bytes in a different encoding, like shift-jis, so we fall back
440 // onto using ISO Latin 1 in those cases.
441 result = WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingUTF8, (CFURLRef)baseURL, YES));
443 result = WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingISOLatin1, (CFURLRef)baseURL, YES));
445 result = [NSURL URLWithString:@""];
// Returns the bytes of this URL as NSData. The bytes are fetched with CFURLGetBytes into a heap
// buffer of URL_BYTES_BUFFER_LENGTH bytes, which is grown to the exact size reported by
// CFURLGetBytes when the first attempt returns -1. The buffer is handed over to the NSData.
// When a base URL is involved, the data is resolved against it and the original data of the
// resulting URL is returned instead.
// NOTE(review): the base-URL branch presumably runs only when baseURL is non-nil — confirm.
450 - (NSData *)_web_originalData
452 UInt8 *buffer = (UInt8 *)malloc(URL_BYTES_BUFFER_LENGTH);
453 CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, URL_BYTES_BUFFER_LENGTH);
454 if (bytesFilled == -1) {
// Passing a NULL buffer makes CFURLGetBytes report the number of bytes required.
455 CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
456 buffer = (UInt8 *)realloc(buffer, bytesToAllocate);
457 bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, bytesToAllocate);
458 ASSERT(bytesFilled == bytesToAllocate);
461 // buffer is adopted by the NSData
462 NSData *data = [NSData dataWithBytesNoCopy:buffer length:bytesFilled freeWhenDone:YES];
464 NSURL *baseURL = (NSURL *)CFURLGetBaseURL((CFURLRef)self);
466 return [[NSURL _web_URLWithData:data relativeToURL:baseURL] _web_originalData];
// Returns the bytes from -_web_originalData as an autoreleased string decoded as ISO Latin-1.
470 - (NSString *)_web_originalDataAsString
472 return [[[NSString alloc] initWithData:[self _web_originalData] encoding:NSISOLatin1StringEncoding] autorelease];
// Returns a newly created CFString in which every code point that isLookalikeCharacter flags is
// replaced by the %-escaped bytes of its UTF-8 encoding; all other code points are copied through
// as UTF-16. The result comes from CFStringCreateWithCharacters, so the caller is responsible
// for releasing it (the caller in this file wraps it in WebCFAutorelease).
475 static CFStringRef createStringWithEscapedUnsafeCharacters(CFStringRef string)
477 CFIndex length = CFStringGetLength(string);
478 Vector<UChar, 2048> sourceBuffer(length);
479 CFStringGetCharacters(string, CFRangeMake(0, length), sourceBuffer.data());
481 Vector<UChar, 2048> outBuffer;
// U16_NEXT decodes one code point (joining surrogate pairs) and advances i.
486 U16_NEXT(sourceBuffer, i, length, c)
488 if (isLookalikeCharacter(c)) {
// A code point needs at most four UTF-8 bytes; each byte becomes "%XX".
489 uint8_t utf8Buffer[4];
491 UBool failure = false;
492 U8_APPEND(utf8Buffer, offset, 4, c, failure)
495 for (CFIndex j = 0; j < offset; ++j) {
496 outBuffer.append('%');
497 outBuffer.append(hexDigit(utf8Buffer[j] >> 4));
498 outBuffer.append(hexDigit(utf8Buffer[j] & 0xf));
// Safe code point: re-encode as one or two UTF-16 units and copy unchanged.
501 UChar utf16Buffer[2];
503 UBool failure = false;
504 U16_APPEND(utf16Buffer, offset, 2, c, failure)
506 for (CFIndex j = 0; j < offset; ++j)
507 outBuffer.append(utf16Buffer[j]);
511 return CFStringCreateWithCharacters(NULL, outBuffer.data(), outBuffer.size());
// Produces a string form of this URL suitable for showing to the user. Working on the original
// URL bytes, it inspects %XX escape sequences (un-escaping those for bytes above 0x7f), notes
// whether an "xn--" label is present, and tries to interpret the result as UTF-8. If that fails,
// bytes are written back as %XX escapes and the conversion is retried. Finally host names are
// mapped via mapHostNames, the string is canonically precomposed, and lookalike characters are
// %-escaped.
514 - (NSString *)_web_userVisibleString
516 NSData *data = [self _web_originalData];
517 const unsigned char *before = static_cast<const unsigned char*>([data bytes]);
518 int length = [data length];
520 bool needsHostNameDecoding = false;
522 const unsigned char *p = before;
// One extra byte beyond the worst-case 3x expansion, for the terminating NUL.
523 int bufferLength = (length * 3) + 1;
524 char *after = static_cast<char *>(malloc(bufferLength)); // large enough to %-escape every character
527 for (i = 0; i < length; i++) {
528 unsigned char c = p[i];
529 // unescape escape sequences that indicate bytes greater than 0x7f
530 if (c == '%' && (i + 1 < length && isHexDigit(p[i + 1])) && i + 2 < length && isHexDigit(p[i + 2])) {
531 unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]);
545 // Check for "xn--" in an efficient, non-case-sensitive, way.
// NOTE(review): the guard is on the input index i, but the look-behind reads the output pointer q.
// If earlier %XX sequences were collapsed to single bytes, q may be fewer than 4 bytes past the
// start of the buffer — confirm that q[-4] cannot read before "after".
546 if (c == '-' && i >= 3 && !needsHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
547 needsHostNameDecoding = true;
552 // Check string to see if it can be converted to display using UTF-8
553 NSString *result = [NSString stringWithUTF8String:after];
555 // Could not convert to UTF-8.
556 // Convert characters greater than 0x7f to escape sequences.
557 // Shift current string to the end of the buffer
558 // then we will copy back bytes to the start of the buffer
560 int afterlength = q - after;
561 char *p = after + bufferLength - afterlength - 1;
562 memmove(p, after, afterlength + 1); // copies trailing '\0'
565 unsigned char c = *p;
568 *q++ = hexDigit(c >> 4);
569 *q++ = hexDigit(c & 0xf);
576 result = [NSString stringWithUTF8String:after];
// NOTE(review): the second argument is "encode"; presumably this line sits in a branch taken only
// when needsHostNameDecoding is true, so host names are decoded here — confirm.
581 result = mapHostNames(result, !needsHostNameDecoding);
582 result = [result precomposedStringWithCanonicalMapping];
583 return WebCFAutorelease(createStringWithEscapedUnsafeCharacters((CFStringRef)result));
// Emptiness test: without a base URL, the URL is empty when CFURLGetBytes reports zero bytes;
// with a base URL, the length of the resolved original data decides.
588 if (!CFURLGetBaseURL((CFURLRef)self))
589 return CFURLGetBytes((CFURLRef)self, NULL, 0) == 0;
590 return [[self _web_originalData] length] == 0;
// Returns the original URL bytes as a NUL-terminated C string. The returned pointer refers to
// the storage of the temporary NSMutableData created here, so it is only valid while that
// object is alive.
593 - (const char *)_web_URLCString
595 NSMutableData *data = [NSMutableData data];
596 [data appendData:[self _web_originalData]];
597 [data appendBytes:"\0" length:1];
598 return (const char *)[data bytes];
// Returns the URL of the canonical request that the protocol class handling this URL produces.
// A request is built for self, WKNSURLProtocolClassForRequest selects the class, and that class's
// +canonicalRequestForRequest: supplies the canonical request.
// NOTE(review): the no-class path and the release of "request" are not shown here — confirm.
601 - (NSURL *)_webkit_canonicalize
603 NSURLRequest *request = [[NSURLRequest alloc] initWithURL:self];
604 Class concreteClass = WKNSURLProtocolClassForRequest(request);
605 if (!concreteClass) {
610 // This applies NSURL's concept of canonicalization, but not KURL's concept. It would
611 // make sense to apply both, but when we tried that it caused a performance degradation
612 // (see 5315926). It might make sense to apply only the KURL concept and not the NSURL
613 // concept, but it's too risky to make that change for WebKit 3.0.
614 NSURLRequest *newRequest = [concreteClass canonicalRequestForRequest:request];
615 NSURL *newURL = [newRequest URL];
// Retain/autorelease keeps the URL alive independently of newRequest.
616 NSURL *result = [[newURL retain] autorelease];
// Returns a URL built from the bytes that precede the given component, also dropping the single
// byte immediately before it (the new length is fragRg.location - 1). The bytes are interpreted
// as UTF-8, with ISO Latin-1 as the fallback; self is returned when no URL could be created.
// NOTE(review): presumably returns self when the component is not found — confirm.
622 - (NSURL *)_web_URLByTruncatingOneCharacterBeforeComponent:(CFURLComponentType)component
624 CFRange fragRg = CFURLGetByteRangeForComponent((CFURLRef)self, component, NULL);
625 if (fragRg.location == kCFNotFound)
// URLs that fit in 2048 bytes use the stack buffer; larger ones get a heap buffer.
628 UInt8 *urlBytes, buffer[2048];
629 CFIndex numBytes = CFURLGetBytes((CFURLRef)self, buffer, 2048);
630 if (numBytes == -1) {
631 numBytes = CFURLGetBytes((CFURLRef)self, NULL, 0);
632 urlBytes = static_cast<UInt8*>(malloc(numBytes));
633 CFURLGetBytes((CFURLRef)self, urlBytes, numBytes);
637 NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingUTF8, NULL));
639 result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingISOLatin1, NULL));
// Only the heap buffer needs freeing.
641 if (urlBytes != buffer) free(urlBytes);
642 return result ? [result autorelease] : self;
// Returns this URL truncated just before its fragment component and the separator preceding it.
645 - (NSURL *)_webkit_URLByRemovingFragment
647 return [self _web_URLByTruncatingOneCharacterBeforeComponent:kCFURLComponentFragment];
// Returns this URL truncated just before its resource specifier and the separator preceding it.
650 - (NSURL *)_webkit_URLByRemovingResourceSpecifier
652 return [self _web_URLByTruncatingOneCharacterBeforeComponent:kCFURLComponentResourceSpecifier];
// Returns a copy of this URL with the given component, plus the single byte that follows it
// (its separator, such as the '@' after user info), removed. Returns self when the component is
// absent, when its range lies beyond the URL's bytes, when a working buffer cannot be allocated,
// or when no URL can be created from the edited bytes.
- (NSURL *)_web_URLByRemovingComponentAndSubsequentCharacter:(CFURLComponentType)component
{
    CFRange range = CFURLGetByteRangeForComponent((CFURLRef)self, component, 0);
    if (range.location == kCFNotFound)
        return self;

    // Remove one subsequent character.
    ++range.length;

    // URLs that fit in 2048 bytes are edited in the stack buffer; larger ones in a heap buffer
    // sized by asking CFURLGetBytes for the required length.
    UInt8 buffer[2048];
    UInt8* urlBytes = buffer;
    CFIndex numBytes = CFURLGetBytes((CFURLRef)self, buffer, 2048);
    if (numBytes == -1) {
        numBytes = CFURLGetBytes((CFURLRef)self, NULL, 0);
        urlBytes = static_cast<UInt8*>(malloc(numBytes));
        if (!urlBytes)
            return self;
        CFURLGetBytes((CFURLRef)self, urlBytes, numBytes);
    }

    NSURL *result = nil;
    if (numBytes >= range.location) {
        // Clamp the range so it never extends past the end of the URL bytes.
        if (numBytes < range.location + range.length)
            range.length = numBytes - range.location;

        // Slide the tail down over the removed range. Only the bytes after the range are moved:
        // numBytes - (location + length). Moving more would read and write past the URL bytes.
        CFIndex tailLength = numBytes - (range.location + range.length);
        memmove(urlBytes + range.location, urlBytes + range.location + range.length, tailLength);

        // Prefer UTF-8; fall back to ISO Latin-1 when the bytes are not valid UTF-8.
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingUTF8, NULL));
        if (!result)
            result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, numBytes - range.length, kCFStringEncodingISOLatin1, NULL));
    }

    // Free the heap buffer on every path, including the out-of-range case above.
    if (urlBytes != buffer)
        free(urlBytes);

    return result ? [result autorelease] : self;
}
// Returns this URL with its user-info component and the character following it removed.
691 - (NSURL *)_web_URLByRemovingUserInfo
693 return [self _web_URLByRemovingComponentAndSubsequentCharacter:kCFURLComponentUserInfo];
// Applies the NSString -_webkit_isJavaScriptURL test to this URL's original data string.
696 - (BOOL)_webkit_isJavaScriptURL
698 return [[self _web_originalDataAsString] _webkit_isJavaScriptURL];
// Applies the NSString -_webkit_scriptIfJavaScriptURL extraction to this URL's absoluteString
// (unlike the neighbouring predicates, which use the original data string).
701 - (NSString *)_webkit_scriptIfJavaScriptURL
703 return [[self absoluteString] _webkit_scriptIfJavaScriptURL];
// Applies the NSString -_webkit_isFileURL test to this URL's original data string.
706 - (BOOL)_webkit_isFileURL
708 return [[self _web_originalDataAsString] _webkit_isFileURL];
// Applies the NSString -_webkit_isFTPDirectoryURL test to this URL's original data string.
711 - (BOOL)_webkit_isFTPDirectoryURL
713 return [[self _web_originalDataAsString] _webkit_isFTPDirectoryURL];
// YES when the original data string has a case-insensitive "about:" prefix or the URL is empty.
716 - (BOOL)_webkit_shouldLoadAsEmptyDocument
718 return [[self _web_originalDataAsString] _webkit_hasCaseInsensitivePrefix:@"about:"] || [self _web_isEmpty];
// Returns a URL whose scheme bytes have been converted to ASCII lower case. The URL bytes are
// copied into a buffer, each byte in the scheme range is lowercased with toASCIILower, and a new
// URL is created from the buffer only when at least one byte changed.
// NOTE(review): presumably returns self when the scheme is missing or nothing changed — confirm.
721 - (NSURL *)_web_URLWithLowercasedScheme
// The scheme range is received through the function's out-parameter; the return value is unused.
724 CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, &range);
725 if (range.location == kCFNotFound) {
// Despite its name, static_buffer is an ordinary stack array; a heap buffer replaces it for
// URLs longer than URL_BYTES_BUFFER_LENGTH.
729 UInt8 static_buffer[URL_BYTES_BUFFER_LENGTH];
730 UInt8 *buffer = static_buffer;
731 CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, URL_BYTES_BUFFER_LENGTH);
732 if (bytesFilled == -1) {
733 CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
734 buffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
735 bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, bytesToAllocate);
736 ASSERT(bytesFilled == bytesToAllocate);
741 for (i = 0; i < range.length; ++i) {
742 char c = buffer[range.location + i];
743 char lower = toASCIILower(c);
745 buffer[range.location + i] = lower;
750 NSURL *result = changed
751 ? (NSURL *)WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, buffer, bytesFilled, kCFStringEncodingUTF8, nil, YES))
754 if (buffer != static_buffer) {
// Tests whether the query component's range including separators exists and is exactly one byte
// long, i.e. the URL carries a bare query separator with no query text.
// NOTE(review): presumably returns YES in that case and NO otherwise — confirm.
762 -(BOOL)_web_hasQuestionMarkOnlyQueryString
764 CFRange rangeWithSeparators;
765 CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentQuery, &rangeWithSeparators);
766 if (rangeWithSeparators.location != kCFNotFound && rangeWithSeparators.length == 1) {
// Returns, as ISO Latin-1 data, the separator characters that follow the scheme's ':' in the
// absolute string (the part of the scheme's range-with-separators beyond the scheme and one
// character). The result stays nil when the scheme has no range with separators or when the
// computed range does not fit inside the absolute string.
772 -(NSData *)_web_schemeSeparatorWithoutColon
774 NSData *result = nil;
775 CFRange rangeWithSeparators;
776 CFRange range = CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, &rangeWithSeparators);
777 if (rangeWithSeparators.location != kCFNotFound) {
778 NSString *absoluteString = [self absoluteString];
// Start one character past the end of the scheme (skipping the ':') and take what remains of the
// separators.
779 NSRange separatorsRange = NSMakeRange(range.location + range.length + 1, rangeWithSeparators.length - range.length - 1);
780 if (separatorsRange.location + separatorsRange.length <= [absoluteString length]) {
781 NSString *slashes = [absoluteString substringWithRange:separatorsRange];
782 result = [slashes dataUsingEncoding:NSISOLatin1StringEncoding];
// Sentinel component type meaning "the whole URL" for -_web_dataForURLComponentType:.
788 #define completeURL (CFURLComponentType)-1
// Returns the bytes of one URL component (or of the whole URL when componentType is completeURL)
// with every byte <= 0x20 or >= 0x7f %-escaped. Query components get a leading '?' when they are
// non-empty or when the URL has a question-mark-only query string.
790 -(NSData *)_web_dataForURLComponentType:(CFURLComponentType)componentType
792 static int URLComponentTypeBufferLength = 2048;
794 UInt8 staticAllBytesBuffer[URLComponentTypeBufferLength];
795 UInt8 *allBytesBuffer = staticAllBytesBuffer;
797 CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, URLComponentTypeBufferLength);
798 if (bytesFilled == -1) {
799 CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
800 allBytesBuffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
801 bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, bytesToAllocate);
805 if (componentType != completeURL) {
806 range = CFURLGetByteRangeForComponent((CFURLRef)self, componentType, NULL);
// NOTE(review): if this not-found path returns early, a heap-allocated allBytesBuffer would have
// to be freed first — confirm that it is.
807 if (range.location == kCFNotFound) {
813 range.length = bytesFilled;
// Copy the component bytes so the escaping loop works on its own storage.
816 NSData *componentData = [NSData dataWithBytes:allBytesBuffer + range.location length:range.length];
818 const unsigned char *bytes = static_cast<const unsigned char *>([componentData bytes]);
819 NSMutableData *resultData = [NSMutableData data];
820 // NOTE: add leading '?' to non-zero length query strings.
821 // NOTE: retain question-mark only query strings.
822 if (componentType == kCFURLComponentQuery) {
823 if (range.length > 0 || [self _web_hasQuestionMarkOnlyQueryString]) {
824 [resultData appendBytes:"?" length:1];
828 for (i = 0; i < range.length; i++) {
829 unsigned char c = bytes[i];
// Control characters, space, DEL and non-ASCII bytes are emitted as three-byte escapes.
830 if (c <= 0x20 || c >= 0x7f) {
833 escaped[1] = hexDigit(c >> 4);
834 escaped[2] = hexDigit(c & 0xf);
835 [resultData appendBytes:escaped length:3];
840 [resultData appendBytes:b length:1];
844 if (staticAllBytesBuffer != allBytesBuffer) {
845 free(allBytesBuffer);
// Returns the scheme component's bytes via -_web_dataForURLComponentType:.
851 -(NSData *)_web_schemeData
853 return [self _web_dataForURLComponentType:kCFURLComponentScheme];
// Returns the host component's bytes. For URLs whose scheme is "file" (compared without regard
// to case), a host of "localhost" is reported as nil.
// NOTE(review): presumably returns result unchanged for all other schemes — confirm.
856 -(NSData *)_web_hostData
858 NSData *result = [self _web_dataForURLComponentType:kCFURLComponentHost];
859 NSData *scheme = [self _web_schemeData];
860 // Take off localhost for file
861 if ([scheme _web_isCaseInsensitiveEqualToCString:"file"]) {
862 return ([result _web_isCaseInsensitiveEqualToCString:"localhost"]) ? nil : result;
// Returns the host component as an autoreleased string decoded from UTF-8. When there is no host
// data (for example a file: URL whose host is "localhost"), empty data is decoded instead.
- (NSString *)_web_hostString
{
    NSData *data = [self _web_hostData];
    // Substitute empty data for a missing host so nil is never passed to -initWithData:encoding:.
    if (!data)
        data = [NSData data];
    // Decode the data fetched above. Calling -_web_hostData a second time would throw away the
    // nil fallback and compute the host bytes twice.
    return [[[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] autorelease];
}
// Forwards to suggestedFilenameWithMIMEType() with this URL and the given MIME type.
876 - (NSString *)_webkit_suggestedFilenameWithMIMEType:(NSString *)MIMEType
878 return suggestedFilenameWithMIMEType(self, MIMEType);
883 @implementation NSString (WebNSURLExtras)
// Scans the UTF-8 form of the string for anything -_web_userVisibleString would change: control
// characters, space or DEL, %XX escape sequences, and the "xn--" prefix of an encoded host name.
// NOTE(review): presumably returns NO as soon as one of these is found and YES otherwise — confirm.
885 - (BOOL)_web_isUserVisibleURL
// Try a fixed stack buffer first; -UTF8String is the fallback when the string does not fit.
890 char static_buffer[1024];
892 BOOL success = CFStringGetCString((CFStringRef)self, static_buffer, 1023, kCFStringEncodingUTF8);
896 p = [self UTF8String];
899 int length = strlen(p);
901 // check for characters <= 0x20 or >=0x7f, %-escape sequences of %7f, and xn--, these
902 // are the things that will lead _web_userVisibleString to actually change things.
904 for (i = 0; i < length; i++) {
905 unsigned char c = p[i];
906 // escape control characters, space, and delete
// NOTE(review): the comment above the loop says ">=0x7f" but this test only matches c == 0x7f —
// confirm which is intended.
907 if (c <= 0x20 || c == 0x7f) {
910 } else if (c == '%' && (i + 1 < length && isHexDigit(p[i + 1])) && i + 2 < length && isHexDigit(p[i + 2])) {
911 unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]);
918 // Check for "xn--" in an efficient, non-case-sensitive, way.
919 if (c == '-' && i >= 3 && (p[i - 3] | 0x20) == 'x' && (p[i - 2] | 0x20) == 'n' && p[i - 1] == '-') {
// YES when the string starts with "javascript:", compared without regard to case.
930 - (BOOL)_webkit_isJavaScriptURL
932 return [self _webkit_hasCaseInsensitivePrefix:@"javascript:"];
// YES when the string starts with "file:" (anchored, case-insensitive search).
935 - (BOOL)_webkit_isFileURL
937 return [self rangeOfString:@"file:" options:(NSCaseInsensitiveSearch | NSAnchoredSearch)].location != NSNotFound;
// Returns the result of decodeURLEscapeSequences() applied to this string.
940 - (NSString *)_webkit_stringByReplacingValidPercentEscapes
942 return decodeURLEscapeSequences(self);
// For a javascript: URL string, returns the script text: everything after the 11-character
// "javascript:" prefix, with valid percent escapes replaced.
// NOTE(review): presumably returns nil when the string is not a javascript: URL — confirm.
945 - (NSString *)_webkit_scriptIfJavaScriptURL
947 if (![self _webkit_isJavaScriptURL]) {
950 return [[self substringFromIndex:11] _webkit_stringByReplacingValidPercentEscapes];
// YES when the string ends in '/' and starts with "ftp:" (case-insensitive). Strings shorter
// than five characters are handled by the length check before the last character is read.
// NOTE(review): presumably that short-string path returns NO — confirm.
953 - (BOOL)_webkit_isFTPDirectoryURL
955 int length = [self length];
956 if (length < 5) { // 5 is length of "ftp:/"
959 unichar lastChar = [self characterAtIndex:length - 1];
960 return lastChar == '/' && [self _webkit_hasCaseInsensitivePrefix:@"ftp:"];
// Reads a white-list file of script names and sets the matching bits in IDNScriptWhiteList.
// The file is read one word at a time with fscanf; '#' starts a comment that runs to the end of
// the line. Each word is mapped to an ICU script code with u_getPropertyValueEnum, and words
// that do not map to a code in [0, USCRIPT_CODE_LIMIT) are ignored.
// NOTE(review): presumably returns NO when the file cannot be opened and YES otherwise — confirm.
964 static BOOL readIDNScriptWhiteListFile(NSString *filename)
969 FILE *file = fopen([filename fileSystemRepresentation], "r");
974 // Read a word at a time.
975 // Allow comments, starting with # character to the end of the line.
977 // Skip a comment if present.
978 int result = fscanf(file, " #%*[^\n\r]%*[\n\r]");
983 // Read a script name if present.
// At most 32 characters of the word are stored; the rest of an over-long word is discarded.
985 result = fscanf(file, " %32[^# \t\n\r]%*[^# \t\n\r] ", word);
990 // Got a word, map to script code and put it into the array.
991 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
992 if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
// Bit (script % 32) of element (script / 32) represents this script.
993 size_t index = script / 32;
994 uint32_t mask = 1 << (script % 32);
995 IDNScriptWhiteList[index] |= mask;
// Loads the IDN script white list. Every Library directory (all domains) is searched for
// "IDNScriptWhiteList.txt"; the copy bundled with com.apple.WebKit is the fallback.
// NOTE(review): presumably the search stops at the first file that is read successfully — confirm.
1003 static void readIDNScriptWhiteList(void)
1005 // Read white list from library.
1006 NSArray *dirs = NSSearchPathForDirectoriesInDomains(NSLibraryDirectory, NSAllDomainsMask, YES);
1007 int i, numDirs = [dirs count];
1008 for (i = 0; i < numDirs; i++) {
1009 NSString *dir = [dirs objectAtIndex:i];
1010 if (readIDNScriptWhiteListFile([dir stringByAppendingPathComponent:@"IDNScriptWhiteList.txt"])) {
1015 // Fall back on white list inside bundle.
1016 NSBundle *bundle = [NSBundle bundleWithIdentifier:@"com.apple.WebKit"];
1017 readIDNScriptWhiteListFile([bundle pathForResource:@"IDNScriptWhiteList" ofType:@"txt"]);
// Checks every code point of a UTF-16 host name against the IDN script white list, which is
// loaded once through pthread_once. For each code point the ICU script is looked up and its bit
// tested in IDNScriptWhiteList; the code point is also tested with isLookalikeCharacter.
// NOTE(review): presumably returns NO on an ICU error, an out-of-range script, a script that is
// not white-listed, or a lookalike character, and YES otherwise — confirm.
1020 static BOOL allCharactersInIDNScriptWhiteList(const UChar *buffer, int32_t length)
1022 pthread_once(&IDNScriptWhiteListFileRead, readIDNScriptWhiteList);
1025 while (i < length) {
// U16_NEXT decodes one code point (joining surrogate pairs) and advances i.
1027 U16_NEXT(buffer, i, length, c)
1028 UErrorCode error = U_ZERO_ERROR;
1029 UScriptCode script = uscript_getScript(c, &error);
1030 if (error != U_ZERO_ERROR) {
1031 LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
1035 LOG_ERROR("got negative number for script code from ICU: %d", script);
1038 if (script >= USCRIPT_CODE_LIMIT) {
1041 size_t index = script / 32;
1042 uint32_t mask = 1 << (script % 32);
1043 if (!(IDNScriptWhiteList[index] & mask)) {
1047 if (isLookalikeCharacter(c))
// Returns YES when the host name ends in the Cyrillic top-level domain U+0440 U+0444 (optionally
// followed by the root dot) and every character of the label directly before that domain is a
// modern Russian lowercase letter, an ASCII digit or a dash. Returns NO for every other name,
// including an empty one.
static BOOL allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
{
    // An empty name has no top level domain, and must not be indexed at length - 1.
    if (length <= 0)
        return NO;

    // Skip trailing dot for root domain.
    if (buffer[length - 1] == '.')
        --length;

    if (length > 3
        && buffer[length - 3] == '.'
        && buffer[length - 2] == 0x0440 // CYRILLIC SMALL LETTER ER
        && buffer[length - 1] == 0x0444) // CYRILLIC SMALL LETTER EF
    {
        // Rules defined by <http://www.cctld.ru/ru/docs/rulesrf.php>. This code only checks requirements that matter for presentation purposes.
        // Walk backwards from the character before the ".<tld>" suffix down to index 0 inclusive,
        // so that the first character of the name is checked as well.
        for (int32_t i = length - 4; i >= 0; --i) {
            UChar ch = buffer[i];

            // Only modern Russian letters, digits and dashes are allowed.
            if ((ch >= 0x0430 && ch <= 0x044f)
                || ch == 0x0451
                || (ch >= '0' && ch <= '9')
                || ch == '-')
                continue;

            // Only check top level domain. Lower level registrars may have different rules.
            if (ch == '.')
                break;

            return NO;
        }
        return YES;
    }

    // Not a known top level domain with special rules.
    return NO;
}
// Runs the host name located at `range` through ICU's IDN conversion: to ASCII when
// encode is YES, to Unicode when encode is NO.
//
// A nil result means no mapping is applied. That covers a host name that is already in
// the requested form, one longer than HOST_NAME_BUFFER_LENGTH, an empty receiver, an ICU
// failure, and a decoded name that passes neither the script white list nor the per-TLD
// rules.
// With makeString == NO, a non-nil result is always self and only signals that a mapping
// is needed. With makeString == YES, a non-nil result is the mapped host name.
- (NSString *)_web_mapHostNameWithRange:(NSRange)range encode:(BOOL)encode makeString:(BOOL)makeString
{
    if (range.length > HOST_NAME_BUFFER_LENGTH || [self length] == 0)
        return nil;

    UChar input[HOST_NAME_BUFFER_LENGTH];
    UChar output[HOST_NAME_BUFFER_LENGTH];

    // When encoding, percent escapes inside the host name are resolved first; if that
    // succeeds the unescaped copy replaces the receiver as the conversion input.
    NSString *hostSource = self;
    NSRange hostRange = range;
    if (encode) {
        BOOL hasPercent = [self rangeOfString:@"%" options:NSLiteralSearch range:range].location != NSNotFound;
        if (hasPercent) {
            NSString *escaped = [self substringWithRange:range];
            NSString *unescaped = WebCFAutorelease(CFURLCreateStringByReplacingPercentEscapes(NULL, (CFStringRef)escaped, CFSTR("")));
            if (unescaped != nil) {
                hostSource = unescaped;
                hostRange = NSMakeRange(0, [unescaped length]);
            }
        }
    }

    int inputLength = hostRange.length;
    [hostSource getCharacters:input range:hostRange];

    UErrorCode status = U_ZERO_ERROR;
    int32_t outputLength;
    if (encode)
        outputLength = uidna_IDNToASCII(input, inputLength, output, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    else
        outputLength = uidna_IDNToUnicode(input, inputLength, output, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    if (status != U_ZERO_ERROR)
        return nil;

    // Conversion left the host name untouched: nothing to map.
    BOOL unchanged = outputLength == inputLength && memcmp(input, output, inputLength * sizeof(UChar)) == 0;
    if (unchanged)
        return nil;

    // A decoded name is only used when one of the two checks accepts it.
    if (!encode) {
        BOOL acceptable = allCharactersInIDNScriptWhiteList(output, outputLength) || allCharactersAllowedByTLDRules(output, outputLength);
        if (!acceptable)
            return nil;
    }

    if (!makeString)
        return (NSString *)self;
    return (NSString *)[NSString stringWithCharacters:output length:outputLength];
}
// Returns YES when _web_mapHostNameWithRange:encode:makeString: reports (with a non-nil
// result) that the host name at `range` has a decoded form to map to.
- (BOOL)_web_hostNameNeedsDecodingWithRange:(NSRange)range
{
    NSString *mapped = [self _web_mapHostNameWithRange:range encode:NO makeString:NO];
    return mapped != nil;
}
// Returns YES when _web_mapHostNameWithRange:encode:makeString: reports (with a non-nil
// result) that the host name at `range` has an encoded form to map to.
- (BOOL)_web_hostNameNeedsEncodingWithRange:(NSRange)range
{
    NSString *mapped = [self _web_mapHostNameWithRange:range encode:YES makeString:NO];
    return mapped != nil;
}
// Returns the decoded form of the host name at `range`, or nil when
// _web_mapHostNameWithRange:encode:makeString: produces no mapping.
- (NSString *)_web_decodeHostNameWithRange:(NSRange)range
{
    NSString *decoded = [self _web_mapHostNameWithRange:range encode:NO makeString:YES];
    return decoded;
}
// Returns the encoded form of the host name at `range`, or nil when
// _web_mapHostNameWithRange:encode:makeString: produces no mapping.
- (NSString *)_web_encodeHostNameWithRange:(NSRange)range
{
    NSString *encoded = [self _web_mapHostNameWithRange:range encode:YES makeString:YES];
    return encoded;
}
// Treats the whole receiver as a host name and returns its decoded form, or the
// receiver itself when no mapping is produced.
- (NSString *)_web_decodeHostName
{
    NSRange wholeString = NSMakeRange(0, [self length]);
    NSString *decoded = [self _web_mapHostNameWithRange:wholeString encode:NO makeString:YES];
    if (decoded == nil)
        return self;
    return decoded;
}
// Treats the whole receiver as a host name and returns its encoded form, or the
// receiver itself when no mapping is produced.
- (NSString *)_web_encodeHostName
{
    NSRange wholeString = NSMakeRange(0, [self length]);
    NSString *encoded = [self _web_mapHostNameWithRange:wholeString encode:YES makeString:YES];
    if (encoded == nil)
        return self;
    return encoded;
}
// Returns the range {0, index of first ":"} when the receiver has at least one character
// before its first colon and all of those characters are ASCII letters, digits, "+", "."
// or "-". Returns {NSNotFound, 0} in every other case.
-(NSRange)_webkit_rangeOfURLScheme
{
    NSRange noScheme = NSMakeRange(NSNotFound, 0);

    NSRange firstColon = [self rangeOfString:@":"];
    if (firstColon.location == NSNotFound || firstColon.location == 0)
        return noScheme;

    /*
     Building this set is very expensive (10-15 msec on a 2x1.2GHz). If it is not cached
     it swamps everything else when adding items to the autocomplete DB, so it is created
     once and kept. It is an open question whether the character set needs to be
     enforced here at all.
    */
    static NSCharacterSet *nonSchemeCharacters = nil;
    if (nonSchemeCharacters == nil) {
        NSCharacterSet *schemeCharacters = [NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"];
        nonSchemeCharacters = [[schemeCharacters invertedSet] retain];
    }

    NSRange candidate = NSMakeRange(0, firstColon.location);
    NSRange offender = [self rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:candidate];
    if (offender.location != NSNotFound)
        return noScheme;
    return candidate;
}
// Returns YES when the receiver, after whitespace trimming, has a URL scheme according to
// _webkit_rangeOfURLScheme.
-(BOOL)_webkit_looksLikeAbsoluteURL
{
    // Trim whitespace because _web_URLWithString allows whitespace.
    NSString *trimmed = [self _webkit_stringByTrimmingWhitespace];
    NSRange schemeRange = [trimmed _webkit_rangeOfURLScheme];
    return schemeRange.location != NSNotFound;
}
// Returns everything after the first "#" in the receiver (possibly an empty string), or
// nil when the receiver contains no "#".
- (NSString *)_webkit_URLFragment
{
    NSRange hashMark = [self rangeOfString:@"#" options:NSLiteralSearch];
    BOOL hasFragment = hashMark.location != NSNotFound;
    if (!hasFragment)
        return nil;
    return [self substringFromIndex:hashMark.location + 1];
}