439 lines
8.2 KiB
C++
439 lines
8.2 KiB
C++
/*++

   Copyright    (c)    1996    Microsoft Corporation

   Module  Name :

        linkpars.cpp

   Abstract:

        Link parser class implementation. This class is responsible for
        parsing an HTML file for hyperlinks.

   Author:

        Michael Cheuk (mcheuk)

   Project:

        Link Checker

   Revision History:

--*/
|
|
|
|
#include "stdafx.h"
|
|
#include "LinkPars.h"
|
|
|
|
#include "link.h"
|
|
#include "lcmgr.h"
|
|
|
|
#ifdef _DEBUG
|
|
#define new DEBUG_NEW
|
|
#undef THIS_FILE
|
|
static char THIS_FILE[] = __FILE__;
|
|
#endif
|
|
|
|
// Constants
|
|
const CString strLocalHost_c(_T("localhost"));
|
|
|
|
void
|
|
CLinkParser::Parse(
|
|
const CString& strData,
|
|
const CString& strBaseURL,
|
|
CLinkPtrList& rLinkPtrList
|
|
)
|
|
/*++
|
|
|
|
Routine Description:
|
|
|
|
Parse a page of html data
|
|
|
|
Arguments:
|
|
|
|
strData - page of html
|
|
strBaseURL - base URL
|
|
rLinkPtrList - reference to links list. The new links will
|
|
will be added to this list.
|
|
|
|
Return Value:
|
|
|
|
N/A
|
|
|
|
--*/
|
|
{
|
|
// Look for the first '<'
|
|
LPCTSTR lpszOpen = _tcschr(strData, _TUCHAR('<'));
|
|
|
|
while(lpszOpen != NULL)
|
|
{
|
|
// Look for the '>'
|
|
LPCTSTR lpszClose = _tcschr(lpszOpen, _TUCHAR('>'));
|
|
if(lpszClose)
|
|
{
|
|
// The possible tag must be longer than 7 bytes (a href=)
|
|
int iCount = (int)(lpszClose - lpszOpen) - 1; // skip the '<'
|
|
if( iCount > 7 )
|
|
{
|
|
int iIndex = lpszOpen - ((LPCTSTR)strData) + 1; // skip the '<'
|
|
|
|
CString strPossibleURL(strData.Mid(iIndex, iCount));
|
|
|
|
// Parse the possible tag
|
|
if(ParsePossibleTag(strPossibleURL))
|
|
{
|
|
CString strURL;
|
|
BOOL fLocalLink;
|
|
|
|
// We found a valid tag. Time to create new link.
|
|
if( CreateURL(strPossibleURL, strBaseURL, strURL, fLocalLink) )
|
|
{
|
|
rLinkPtrList.AddLink(strURL, strBaseURL, strPossibleURL, fLocalLink);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Look for the next '<'
|
|
lpszOpen = _tcschr(++lpszOpen, _TUCHAR('<'));
|
|
}
|
|
|
|
} // CLinkParser::Parse
|
|
|
|
|
|
// Returns TRUE when the (already lower-cased) tag text begins with lpszPrefix.
static BOOL
TagBeginsWith(
    const CString& strLowerTag,
    LPCTSTR lpszPrefix
    )
{
    return _tcsncmp((LPCTSTR)strLowerTag, lpszPrefix, _tcslen(lpszPrefix)) == 0;
}


BOOL 
CLinkParser::ParsePossibleTag(
    CString& strTag
    )
/*++

Routine Description:

    Parse a single "<.....>" for possible hyperlink. The tag is
    recognized by the (case-insensitive) text it begins with, and the
    attribute that carries the URL for that tag is then extracted with
    GetTagValue().

Arguments:

    strTag - value inside a "<.....>" excluding '<' & '>'
             If this is a hyperlink tag, the hyperlink URL
             will be put in strTag.

Return Value:

    BOOL - TRUE if hyperlink tag. FALSE otherwise.

--*/
{
    // Nothing to classify; also keeps the [0] access below in range
    if(strTag.IsEmpty())
    {
        return FALSE;
    }

    // Classification is done on a lower-case working copy
    CString strLowerTag(strTag);
    strLowerTag.MakeLower();

    //
    // Java
    //      <applet code="name.class" codebase="url" ...>
    //
    // This must be tested before the hyperlink case: that case accepts
    // every tag whose first letter is 'a', which would otherwise claim
    // "applet" and make this branch unreachable.
    //
    if( TagBeginsWith(strLowerTag, _T("applet")) )
    {
        return GetTagValue(strTag, CString(_T("codebase")));
    }

    //
    // HyperLink:
    //      <a href="url" ...>
    //      <a href="url#anchor" ...>
    //      <a href="#anchor" ...>
    //
    // CGI
    //      <a href="url?parameters" ...>
    //
    // Style Sheet
    //      <link rel="stylesheet" href="url" ...>
    //
    // Note: any tag starting with the letter 'a' is accepted here.
    //
    if( strLowerTag[0] == _T('a') ||
        TagBeginsWith(strLowerTag, _T("link")) )
    {
        return GetTagValue(strTag, CString(_T("href")));
    }

    //
    //      <body background="url" ...>
    //
    // Table:
    //      <table background="url" ...>
    //      <th background="url" ...>
    //      <td background="url" ...>
    //
    if( TagBeginsWith(strLowerTag, _T("body"))  ||
        TagBeginsWith(strLowerTag, _T("table")) ||
        TagBeginsWith(strLowerTag, _T("th"))    ||
        TagBeginsWith(strLowerTag, _T("td")) )
    {
        return GetTagValue(strTag, CString(_T("background")));
    }

    //
    // Sound:
    //      <bgsound src="url" ...>
    //      <sound src="url" ...>
    //
    // Frame:
    //      <frame src="url" ...>
    //
    // Netscape embedded:
    //      <embed src="url" ...>
    //
    // JavaScript & VB Script
    //      <script src="url" language="java or vbs" ...>
    //
    if( TagBeginsWith(strLowerTag, _T("bgsound")) ||
        TagBeginsWith(strLowerTag, _T("sound"))   ||
        TagBeginsWith(strLowerTag, _T("frame"))   ||
        TagBeginsWith(strLowerTag, _T("embed"))   ||
        TagBeginsWith(strLowerTag, _T("script")) )
    {
        return GetTagValue(strTag, CString(_T("src")));
    }

    //
    // Image:
    //      <img src="url" ...>
    //
    // Video:
    //      <img dynsrc="url">
    //
    // VRML:
    //      <img vrml="url">
    //
    // The attributes are tried in this order; the first one that
    // yields a value wins.
    //
    if( TagBeginsWith(strLowerTag, _T("img")) )
    {
        if(GetTagValue(strTag, CString(_T("src"))))
        {
            return TRUE;
        }

        if(GetTagValue(strTag, CString(_T("dynsrc"))))
        {
            return TRUE;
        }

        return GetTagValue(strTag, CString(_T("vrml")));
    }

    //
    // Form
    //      <form action="url" ...>
    //
    if( TagBeginsWith(strLowerTag, _T("form")) )
    {
        return GetTagValue(strTag, CString(_T("action")));
    }

    return FALSE;

} // CLinkParser::ParsePossibleTag
|
|
|
|
|
|
// Returns TRUE for the characters treated as separators between the
// pieces of a tag: space, tab, carriage return and line feed.
static BOOL
IsTagWhiteSpace(
    TCHAR ch
    )
{
    return ch == _T(' ')  || ch == _T('\t') ||
           ch == _T('\r') || ch == _T('\n');
}


BOOL 
CLinkParser::GetTagValue(
    CString& strTag, 
    const CString& strParam
    )
/*++

Routine Description:

    Get the hyperlink value from "<.....>"

    The tag text is walked token by token. Each token is either a bare
    word (the element name or a value-less attribute) or a
    name = value pair, where the value may be enclosed in double
    quotes, enclosed in single quotes, or unquoted (ending at the next
    white space). The value of the first attribute whose name equals
    strParam is returned.

    Matching whole attribute names (rather than searching for strParam
    as a substring) keeps e.g. hreflang="en" from being taken as href,
    and an empty value such as href="" from picking up the text of the
    following attribute.

Arguments:

    strTag - value inside a "<.....>" excluding '<' & '>'
             If this is a hyperlink tag, the hyperlink URL
             will be put in strTag. strTag is left untouched when
             FALSE is returned.

    strParam - parameter to look for. For example, src or href.
               It is compared against a lower-cased copy of the tag,
               so it must be given in lower case.

Return Value:

    BOOL - TRUE if hyperlink tag. FALSE otherwise (parameter not
           present, or present with an empty value).

--*/
{
    // Names are matched on a lower-case copy; the value itself is taken
    // from the original text so its case is preserved.
    CString strLower(strTag);
    strLower.MakeLower();

    const int nTagLength   = strLower.GetLength();
    const int nParamLength = strParam.GetLength();

    if(nParamLength == 0)
    {
        return FALSE;
    }

    int i = 0;

    while(i < nTagLength)
    {
        // Skip white space in front of the next token
        while(i < nTagLength && IsTagWhiteSpace(strLower[i]))
        {
            i++;
        }

        if(i >= nTagLength)
        {
            break;
        }

        // The name runs up to white space or '='
        const int iNameStart = i;
        while(i < nTagLength &&
              !IsTagWhiteSpace(strLower[i]) &&
              strLower[i] != _T('='))
        {
            i++;
        }
        const int nNameLength = i - iNameStart;

        // White space is allowed between the name and '='
        int iAssign = i;
        while(iAssign < nTagLength && IsTagWhiteSpace(strLower[iAssign]))
        {
            iAssign++;
        }

        if(iAssign >= nTagLength || strLower[iAssign] != _T('='))
        {
            // Element name or value-less attribute; i already sits past
            // it, so simply move on to the next token.
            continue;
        }

        // Step over '=' and any white space in front of the value
        i = iAssign + 1;
        while(i < nTagLength && IsTagWhiteSpace(strLower[i]))
        {
            i++;
        }

        // Locate the value
        int iValueStart;
        int iValueEnd;

        if(i < nTagLength &&
           (strLower[i] == _T('\"') || strLower[i] == _T('\'')))
        {
            // Quoted value: ends at the matching quote. If the quote is
            // never closed, use the last character as end.
            const TCHAR chQuote = strLower[i];

            iValueStart = ++i;
            while(i < nTagLength && strLower[i] != chQuote)
            {
                i++;
            }
            iValueEnd = i;

            if(i < nTagLength)
            {
                i++; // step over the closing quote
            }
        }
        else
        {
            // Unquoted value: ends at the next white space
            iValueStart = i;
            while(i < nTagLength && !IsTagWhiteSpace(strLower[i]))
            {
                i++;
            }
            iValueEnd = i;
        }

        // Is this the attribute the caller asked for?
        if(nNameLength != nParamLength ||
           _tcsncmp((LPCTSTR)strLower + iNameStart, 
                    (LPCTSTR)strParam, 
                    nParamLength) != 0)
        {
            continue;
        }

        // Drop white space padding inside the quotes
        while(iValueStart < iValueEnd && IsTagWhiteSpace(strLower[iValueStart]))
        {
            iValueStart++;
        }
        while(iValueEnd > iValueStart && IsTagWhiteSpace(strLower[iValueEnd - 1]))
        {
            iValueEnd--;
        }

        if(iValueStart == iValueEnd)
        {
            // The attribute is present but carries no URL
            return FALSE;
        }

        // Copy the value to the input
        strTag = strTag.Mid(iValueStart, iValueEnd - iValueStart);

        // Change '\' to '/'
        CLinkCheckerMgr::ChangeBackSlash(strTag);

        return TRUE;
    }

    return FALSE;

} // CLinkParser::GetTagValue
|
|
|
|
|
|
// Returns TRUE when the host name found in a cracked URL (lpszHost, not
// necessarily null terminated, dwHostLength characters long) is exactly
// strName. Host names are compared without regard to case.
static BOOL
IsSameHostName(
    LPCTSTR lpszHost,
    DWORD dwHostLength,
    const CString& strName
    )
{
    if(lpszHost == NULL || (int)dwHostLength != strName.GetLength())
    {
        return FALSE;
    }

    return _tcsnicmp(lpszHost, (LPCTSTR)strName, dwHostLength) == 0;
}


BOOL 
CLinkParser::CreateURL(
    const CString& strRelativeURL, 
    const CString& strBaseURL, 
    CString& strURL, 
    BOOL& fLocalLink
    )
/*++

Routine Description:

    Create a URL from base URL & relative URL. It also check
    the result for local or remote link.

    A link is local when the host of the combined URL is either
    m_strLocalHostName or "localhost"; every other link, including one
    whose URL cannot be cracked into components, is treated as remote.

Arguments:

    strRelativeURL - relative URL. Everything from the first '#'
                     (the anchor) onward is ignored.
    strBaseURL     - base URL
    strURL         - result URL
    fLocalLink     - will be set to TRUE if this is a local link,
                     FALSE otherwise

Return Value:

    BOOL - TRUE if the URL was created and the user options ask for
           this kind of link (local or remote) to be checked.
           FALSE otherwise.

--*/
{
    ASSERT(CWininet::IsLoaded());

    // Give the out parameter a defined value on every path
    fLocalLink = FALSE;

    // Remove the anchor from the relative URL. The anchor starts at the
    // first '#'.
    CString strNewRelativeURL(strRelativeURL);
    const int iAnchor = strNewRelativeURL.Find(_T('#'));
    if(iAnchor != -1)
    {
        strNewRelativeURL = strNewRelativeURL.Left(iAnchor);
    }

    // Combine the URLs
    DWORD dwLength = INTERNET_MAX_URL_LENGTH;
    LPTSTR lpBuffer = strURL.GetBuffer(dwLength);

    // Start from an empty string so that a combine call which writes
    // nothing does not leave arbitrary buffer contents in strURL.
    lpBuffer[0] = _T('\0');

    CWininet::InternetCombineUrlA(
        strBaseURL, 
        strNewRelativeURL, 
        lpBuffer, 
        &dwLength, 
        ICU_ENCODE_SPACES_ONLY);

    strURL.ReleaseBuffer();

    if(strURL.IsEmpty())
    {
        // No URL was produced; there is nothing to check
        return FALSE;
    }

    // Check for local or remote link. A non-zero dwHostNameLength with
    // a NULL lpszHostName is set up before cracking, as in the original
    // code, so that the host name pointer and length are filled in.
    URL_COMPONENTS urlcomp;

    memset(&urlcomp, 0, sizeof(urlcomp));
    urlcomp.dwStructSize = sizeof(urlcomp);
    urlcomp.dwHostNameLength = 1;

    const BOOL fCracked = 
        CWininet::InternetCrackUrlA(strURL, strURL.GetLength(), NULL, &urlcomp);
    ASSERT(fCracked);

    // The host components are only looked at when cracking succeeded.
    // Length and text are checked against the same name, so a host that
    // merely starts with the local host name is not taken as local.
    if( fCracked &&
        ( IsSameHostName(urlcomp.lpszHostName, 
                         urlcomp.dwHostNameLength, 
                         m_strLocalHostName) ||
          IsSameHostName(urlcomp.lpszHostName, 
                         urlcomp.dwHostNameLength, 
                         strLocalHost_c) ) ) // localhost
    {
        // Local link
        fLocalLink = TRUE;

        return GetLinkCheckerMgr().GetUserOptions().IsCheckLocalLinks() ? TRUE : FALSE;
    }

    // Remote link
    return GetLinkCheckerMgr().GetUserOptions().IsCheckRemoteLinks() ? TRUE : FALSE;

} // CLinkParser::CreateURL
|