2 * Copyright (C) 2009 Google Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 #include "WebPageSerializer.h"
34 #include "DocumentLoader.h"
37 #include "HTMLAllCollection.h"
38 #include "HTMLFrameOwnerElement.h"
39 #include "HTMLInputElement.h"
40 #include "HTMLNames.h"
42 #include "MHTMLArchive.h"
43 #include "PageSerializer.h"
46 #include "WebCString.h"
48 #include "WebFrameImpl.h"
49 #include "WebPageSerializerClient.h"
50 #include "WebPageSerializerImpl.h"
51 #include "WebString.h"
53 #include "WebVector.h"
55 #include "WebViewImpl.h"
57 #include <wtf/text/StringConcatenate.h>
59 using namespace WebCore;
// Returns the URL of the sub-resource referenced by |element|, resolved
// against the element's document with completeURL().
//
// The attribute that is read depends on the element's tag:
//   - img, script, embed, and input elements that report isImageButton(): src
//   - body, table, tr, td: background
//   - blockquote, q, del, ins: cite
//   - link whose type attribute equals "text/css" (ignoring case): href
//   - object: data
// For every other element |attributeName| keeps its initial null value.
63 KURL getSubResourceURLFromElement(Element* element)
66 const QualifiedName* attributeName = 0;
67 if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
68 attributeName = &HTMLNames::srcAttr;
69 else if (element->hasTagName(HTMLNames::inputTag)) {
70 HTMLInputElement* input = static_cast<HTMLInputElement*>(element);
// Only inputs that are image buttons have their src attribute considered.
71 if (input->isImageButton())
72 attributeName = &HTMLNames::srcAttr;
73 } else if (element->hasTagName(HTMLNames::bodyTag)
74 || element->hasTagName(HTMLNames::tableTag)
75 || element->hasTagName(HTMLNames::trTag)
76 || element->hasTagName(HTMLNames::tdTag))
77 attributeName = &HTMLNames::backgroundAttr;
78 else if (element->hasTagName(HTMLNames::blockquoteTag)
79 || element->hasTagName(HTMLNames::qTag)
80 || element->hasTagName(HTMLNames::delTag)
81 || element->hasTagName(HTMLNames::insTag))
82 attributeName = &HTMLNames::citeAttr;
83 else if (element->hasTagName(HTMLNames::linkTag)) {
84 // If the link element is not CSS, ignore it.
85 if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
86 // FIXME: Add support for extracting links of sub-resources which
87 // are inside a style sheet, such as @import, @font-face, url(), etc.
88 attributeName = &HTMLNames::hrefAttr;
90 } else if (element->hasTagName(HTMLNames::objectTag))
91 attributeName = &HTMLNames::dataAttr;
92 else if (element->hasTagName(HTMLNames::embedTag))
93 attributeName = &HTMLNames::srcAttr;
// NOTE(review): |attributeName| is dereferenced on the next line, but no null
// check for the "no matching tag" case is visible in this listing. Confirm
// that one precedes this point in the complete file.
98 String value = element->getAttribute(*attributeName);
99 // Ignore javascript content.
// The value is whitespace-stripped before the prefix test; the trailing
// |false| argument presumably requests a case-insensitive match -- confirm
// against String::startsWith.
100 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
// NOTE(review): the statement guarded by the condition above is not visible
// in this listing; presumably it returns an empty KURL -- confirm.
103 return element->document()->completeURL(value);
// Records what a single |element| contributes to the page's resources.
//
// - An iframe, frame, object or embed element that is a frame owner has its
//   content frame appended to |framesToVisit| unless |visitedFrames| already
//   contains it.
// - The URL returned by getSubResourceURLFromElement() is appended to
//   |resourceURLs| when it is not already in that list; the checks on
//   emptiness, validity and protocol sit in front of that append.
//
// |frameURLs| is accepted but not referenced by the code visible here.
106 void retrieveResourcesForElement(Element* element,
107 Vector<Frame*>* visitedFrames,
108 Vector<Frame*>* framesToVisit,
109 Vector<KURL>* frameURLs,
110 Vector<KURL>* resourceURLs)
112 // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
113 if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
114 || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
115 && element->isFrameOwnerElement()) {
116 Frame* frame = static_cast<HTMLFrameOwnerElement*>(element)->contentFrame();
// NOTE(review): statements around this point are not visible in this listing;
// presumably |frame| is null-checked and the function returns once the frame
// has been queued -- confirm against the complete file.
118 if (!visitedFrames->contains(frame))
119 framesToVisit->append(frame);
124 KURL url = getSubResourceURLFromElement(element);
125 if (url.isEmpty() || !url.isValid())
126 return; // No subresource for this node.
128 // Ignore URLs that have non-standard protocols. Since the FTP protocol
129 // does not have a cache mechanism, we skip it as well.
130 if (!url.protocolInHTTPFamily() && !url.isLocalFile())
// NOTE(review): the statement guarded by the condition above is not visible
// in this listing; presumably an early return -- confirm.
// De-duplicate: each resource URL is stored at most once.
133 if (!resourceURLs->contains(url))
134 resourceURLs->append(url);
// Collects the resources of one |frame|.
//
// The frame's URL is taken from the request of its document loader. The
// visible code tests that URL for validity, tests its protocol against each
// entry of |supportedSchemes|, and tests whether |frame| is already in
// |visitedFrames|. It then appends the frame to |visitedFrames|, appends the
// frame URL to |frameURLs| if it is not already present, and calls
// retrieveResourcesForElement() for the element nodes of the frame's
// document()->all() collection.
//
// NOTE(review): the statements guarded by several of the conditions below
// (presumably return / break / continue) are not visible in this listing --
// confirm against the complete file.
137 void retrieveResourcesForFrame(Frame* frame,
138 const WebKit::WebVector<WebKit::WebCString>& supportedSchemes,
139 Vector<Frame*>* visitedFrames,
140 Vector<Frame*>* framesToVisit,
141 Vector<KURL>* frameURLs,
142 Vector<KURL>* resourceURLs)
144 KURL frameURL = frame->loader()->documentLoader()->request().url();
146 // If the frame's URL is invalid, ignore it, it is not retrievable.
147 if (!frameURL.isValid())
150 // Ignore frames from unsupported schemes.
151 bool isValidScheme = false;
152 for (size_t i = 0; i < supportedSchemes.size(); ++i) {
// Each WebCString scheme is converted to a CString so that its character
// data can be handed to protocolIs().
153 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
154 isValidScheme = true;
161 // If we have already seen that frame, ignore it.
162 if (visitedFrames->contains(frame))
164 visitedFrames->append(frame);
// Several frames may share one URL; record each URL only once.
165 if (!frameURLs->contains(frameURL))
166 frameURLs->append(frameURL);
168 // Now get the resources associated with each node of the document.
169 RefPtr<HTMLAllCollection> allNodes = frame->document()->all();
170 for (unsigned i = 0; i < allNodes->length(); ++i) {
171 Node* node = allNodes->item(i);
172 // We are only interested in HTML resources.
173 if (!node->isElementNode())
175 retrieveResourcesForElement(static_cast<Element*>(node),
176 visitedFrames, framesToVisit,
177 frameURLs, resourceURLs);
// Serializes the page shown by |view| and stores the result in
// |*resourcesParam|.
//
// A PageSerializer fills a vector of PageSerializer::Resource from the view's
// page (|view| is downcast to WebViewImpl to reach it). Each entry is then
// converted to a Resource: the URL is copied, the MIME type is converted with
// ascii(), and the data bytes are copied into a WebCString.
185 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
187 Vector<PageSerializer::Resource> resources;
188 PageSerializer serializer(&resources);
189 serializer.serialize(static_cast<WebViewImpl*>(view)->page());
191 Vector<Resource> result;
192 for (Vector<PageSerializer::Resource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
// NOTE(review): the declaration of |resource| is not visible in this listing;
// presumably a local of type Resource -- confirm.
194 resource.url = iter->url;
195 resource.mimeType = iter->mimeType.ascii();
196 // FIXME: we are copying all the resource data here. Ideally we would have a WebSharedData().
197 resource.data = WebCString(iter->data->data(), iter->data->size());
198 result.append(resource);
201 *resourcesParam = result;
// Generates MHTML data for the page shown by |view| with
// MHTMLArchive::generateMHTMLData() and returns a copy of the bytes as a
// WebCString. |view| is downcast to WebViewImpl to reach its page.
204 WebCString WebPageSerializer::serializeToMHTML(WebView* view)
206 RefPtr<SharedBuffer> mhtml = MHTMLArchive::generateMHTMLData(static_cast<WebViewImpl*>(view)->page());
207 // FIXME: we are copying all the data here. Ideally we would have a WebSharedData().
208 return WebCString(mhtml->data(), mhtml->size());
// Same as serializeToMHTML(), except that the data is produced by
// MHTMLArchive::generateMHTMLDataUsingBinaryEncoding(). The bytes are copied
// into the returned WebCString.
211 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
213 RefPtr<SharedBuffer> mhtml = MHTMLArchive::generateMHTMLDataUsingBinaryEncoding(static_cast<WebViewImpl*>(view)->page());
214 // FIXME: we are copying all the data here. Ideally we would have a WebSharedData().
215 return WebCString(mhtml->data(), mhtml->size());
// Serializes |frame| by constructing a WebPageSerializerImpl with the given
// arguments and returning the result of its serialize() call. All arguments
// are forwarded unchanged.
//
// NOTE(review): |recursive| is forwarded below, but the parameter line that
// declares it is not visible in this listing -- confirm its type and position
// against the header.
218 bool WebPageSerializer::serialize(WebFrame* frame,
220 WebPageSerializerClient* client,
221 const WebVector<WebURL>& links,
222 const WebVector<WebString>& localPaths,
223 const WebString& localDirectoryName)
225 WebPageSerializerImpl serializerImpl(
226 frame, recursive, client, links, localPaths, localDirectoryName);
227 return serializerImpl.serialize();
// Collects the URLs of the frames and sub-resources of the page shown by
// |view|.
//
// Starting from the main frame, frames are taken from the front of a work
// list and passed to retrieveResourcesForFrame() together with
// |supportedSchemes|; frames appended to the work list during that call are
// processed in turn. The collected KURLs are then converted into
// |*resourceURLs| and |*frameURLs|. A URL that was collected both as a frame
// URL and as a resource URL is reported only in |*resourceURLs|.
//
// NOTE(review): the handling of a missing main frame and the function's
// return statements are not visible in this listing -- confirm against the
// complete file.
230 bool WebPageSerializer::retrieveAllResources(WebView* view,
231 const WebVector<WebCString>& supportedSchemes,
232 WebVector<WebURL>* resourceURLs,
233 WebVector<WebURL>* frameURLs) {
234 WebFrameImpl* mainFrame = static_cast<WebFrameImpl*>(view->mainFrame());
238 Vector<Frame*> framesToVisit;
239 Vector<Frame*> visitedFrames;
240 Vector<KURL> frameKURLs;
241 Vector<KURL> resourceKURLs;
243 // Let's retrieve the resources from every frame in this page.
244 framesToVisit.append(mainFrame->frame());
245 while (!framesToVisit.isEmpty()) {
// Pop from the front so frames are handled in the order they were queued.
246 Frame* frame = framesToVisit[0];
247 framesToVisit.remove(0);
248 retrieveResourcesForFrame(frame, supportedSchemes,
249 &visitedFrames, &framesToVisit,
250 &frameKURLs, &resourceKURLs);
253 // Converts the results to WebURLs.
254 WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
255 for (size_t i = 0; i < resourceKURLs.size(); ++i) {
256 resultResourceURLs[i] = resourceKURLs[i];
257 // A frame's src can point to the same URL as another resource, keep the
258 // resource URL only in such cases.
259 size_t index = frameKURLs.find(resourceKURLs[i]);
260 if (index != notFound)
261 frameKURLs.remove(index);
263 *resourceURLs = resultResourceURLs;
// |frameKURLs| no longer contains URLs that were also reported as resources.
264 WebVector<WebURL> resultFrameURLs(frameKURLs.size());
265 for (size_t i = 0; i < frameKURLs.size(); ++i)
266 resultFrameURLs[i] = frameKURLs[i];
267 *frameURLs = resultFrameURLs;
// Returns a <meta http-equiv="Content-Type"> tag declaring text/html content
// with |charset| as its charset. |charset| is concatenated as-is; no escaping
// is applied here.
272 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
274 return makeString("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=", static_cast<const String&>(charset), "\">");
// Returns an HTML comment of the form "<!-- saved from url=(NNNN)... -->",
// surrounded by newlines, where NNNN is the length of url.spec() formatted
// with %04d (zero-padded to at least four digits).
//
// NOTE(review): the argument supplying the %s field is not visible in this
// listing; presumably the character data of url.spec() -- confirm.
277 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
279 return String::format("\n<!-- saved from url=(%04d)%s -->\n",
280 static_cast<int>(url.spec().length()),
// Returns a <base> tag whose href is ".". When |baseTarget| is non-empty it
// is added as the tag's target attribute; it is concatenated as-is, with no
// escaping applied here.
284 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
286 if (baseTarget.isEmpty())
287 return makeString("<base href=\".\">");
288 return makeString("<base href=\".\" target=\"", static_cast<const String&>(baseTarget), "\">");
291 } // namespace WebKit