A
download codepage_detect.cpp
Language: C++
LOC: 211
Project Info
WinMerge
Server: SourceForge
Type: cvs
...erge\winmerge\WinMerge\Src\
   7zCommon.cpp
   7zCommon.h
   AboutDlg.cpp
   AboutDlg.h
   AppSerialize.cpp
   AppSerialize.h
   BCMenu.cpp
   BCMenu.h
   ByteComparator.cpp
   ByteComparator.h
   CCPrompt.cpp
   CCPrompt.h
   charsets.c
   charsets.h
   ChildFrm.cpp
   ChildFrm.h
   codepage.cpp
   codepage.h
   codepage_detect.cpp
   codepage_detect.h
   ColorButton.cpp
   ColorButton.h
   CompareOptions.h
   CompareStatisticsDlg.cpp
   CompareStatisticsDlg.h
   CompareStats.cpp
   CompareStats.h
   ConfigLog.cpp
   ConfigLog.h
   Diff.cpp
   DiffContext.cpp
   DiffContext.h
   DiffFileData.cpp
   DiffFileData.h
   DiffFileInfo.cpp
   DiffFileInfo.h
   DiffItem.cpp
   DiffItem.h
   DiffItemList.cpp
   DiffItemList.h
   DiffList.cpp
   DiffList.h
   diffmain.c
   DiffThread.cpp
   DiffThread.h
   DiffViewBar.cpp
   DiffViewBar.h
   DiffWrapper.cpp
   DiffWrapper.h
   Dir.cpp
   DirActions.cpp
   DirCmpReport.cpp
   DirCmpReport.h
   DirCmpReportDlg.cpp
   DirCmpReportDlg.h
   DirColsDlg.cpp
   DirColsDlg.h
   DirCompProgressDlg.cpp
   DirCompProgressDlg.h
   DirDoc.cpp
   DirDoc.h
   DirFrame.cpp
   DirFrame.h
   DirReportTypes.h
   DirScan.cpp
   DirScan.h
   DirView.cpp
   DirView.h
   DirViewColHandler.cpp
   DirViewColItems.cpp
   DirViewColItems.h
   dlgutil.cpp
   dlgutil.h
   dllpstub.cpp
   dllpstub.h
   dllver.cpp
   dllver.h
   EditorFilepathBar.cpp
   EditorFilepathBar.h
   Exceptions.h
   FileActionScript.cpp
   FileActionScript.h
   FileFilterHelper.cpp
   FileFilterHelper.h
   FileFilterMgr.cpp
   FileFilterMgr.h
   FileFiltersDlg.cpp
   FileFiltersDlg.h
   FileInfo.cpp
   FileInfo.h
   FileLocation.h
   FilepathEdit.cpp
   FilepathEdit.h
   files.cpp
   files.h
   FileTextEncoding.cpp
   FileTextEncoding.h
   FileTextStats.h
   FileTransform.cpp
   FileTransform.h
   GhostTextBuffer.cpp
   GhostTextBuffer.h
   GhostTextView.cpp
   GhostTextView.h
   IAbortable.h
   IntToIntMap.h
   IOptionsPanel.h
   LoadSaveCodepageDlg.cpp
   LoadSaveCodepageDlg.h
   locality.cpp
   locality.h
   LocationBar.cpp
   LocationBar.h
   LocationView.cpp
   LocationView.h
   MainFrm.cpp
   MainFrm.h
   markdown.cpp
   markdown.h
   Merge.cpp
   Merge.dsp
   Merge.h
   Merge.rc
   MergeArgs.cpp
   MergeDiffDetailView.cpp
   MergeDiffDetailView.h
   MergeDoc.cpp
   MergeDoc.h
   MergeDocEncoding.cpp
   MergeDocLineDiffs.cpp
   MergeEditStatus.h
   MergeEditView.cpp
   MergeEditView.h
   MergeLineFlags.h
   MovedBlocks.cpp
   OpenDlg.cpp
   OpenDlg.h
   OptionsDef.h
   OptionsInit.cpp
   OutputDlg.cpp
   OutputDlg.h
   PatchDlg.cpp
   PatchDlg.h
   PatchTool.cpp
   PatchTool.h
   PathContext.cpp
   PathContext.h
   paths.cpp
   paths.h
   PluginManager.cpp
   PluginManager.h
   Plugins.cpp
   Plugins.h
   ProjectFile.cpp
   ProjectFile.h
   ProjectFilePathsDlg.cpp
   ProjectFilePathsDlg.h
   PropArchive.cpp
   PropArchive.h
   PropCodepage.cpp
   PropCodepage.h
   PropColors.cpp
   PropColors.h
   PropCompare.cpp
   PropCompare.h
   PropEditor.cpp
   PropEditor.h
   PropGeneral.cpp
   PropGeneral.h
   PropLineFilter.cpp
   PropLineFilter.h
   PropRegistry.cpp
   PropRegistry.h
   PropSyntaxColors.cpp
   PropSyntaxColors.h
   PropTextColors.cpp
   PropTextColors.h
   PropVss.cpp
   PropVss.h
   resource.h
   SaveClosingDlg.cpp
   SaveClosingDlg.h
   SelectUnpackerDlg.cpp
   SelectUnpackerDlg.h
   SetResourceVersions.bat
   SetResourceVersions.wsf
   SharedFilterDlg.cpp
   SharedFilterDlg.h
   SourceControl.cpp
   Splash.cpp
   Splash.h
   ssapi.cpp
   ssapi.h
   ssauto.h
   STACK.C
   StdAfx.cpp
   StdAfx.h
   stringdiffs.cpp
   stringdiffs.h
   stringdiffsi.h
   SyntaxColors.cpp
   SyntaxColors.h
   TestFilterDlg.cpp
   TestFilterDlg.h
   ViewableWhitespace.cpp
   ViewableWhitespace.h
   VSSHelper.cpp
   VSSHelper.h
   VssPrompt.cpp
   VssPrompt.h
   WaitStatusCursor.cpp
   WaitStatusCursor.h
   winnt_supp.h
   WMGotoDlg.cpp
   WMGotoDlg.h
   XmlDoc.cpp
   XmlDoc.h

/** 
 * @file  codepage_detect.cpp
 *
 * @brief Deducing codepage from file contents, when we can
 *
 */
// RCS ID line follows -- this is updated by CVS
// $Id: codepage_detect.cpp,v 1.10 2006/02/13 03:04:48 elsapo Exp $

#include "StdAfx.h"
#include <shlwapi.h>
#include "codepage_detect.h"
#include "unicoder.h"
#include "codepage.h"
#include "charsets.h"
#include "markdown.h"
#include "FileTextEncoding.h"

#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif

/**
 * @brief Alias prefixes that may precede a codepage number in an encoding name
 *
 * FindEncodingIdFromNameOrAlias() walks this table in order, compares each
 * entry case-insensitively against the start of the name, and treats the
 * digits that follow as a codepage number (e.g. "windows-1251").
 */
static LPCTSTR f_wincp_prefixes[] =
{
	_T("WINDOWS-"),
	_T("WINDOWS"),
	_T("CP"),
	_T("CP-"),
	_T("MSDOS"),
	_T("MSDOS-"),
};

/**
 * @brief Is string non-empty and comprised entirely of numbers?
 */
static bool
isNumeric(const CString & str)
{
	if (str.IsEmpty())
		return false;
	for (int i=0; i<str.GetLength(); ++i)
	{
		TCHAR ch = str[i];
		if (!_istascii(ch) || !_istdigit(ch))
			return false;
	}
	return true;
}

/**
 * @brief Resolve an encoding name, a bare codepage number, or a prefixed
 * codepage alias to an encoding id
 *
 * Lookup order:
 *  -# the name exactly as given, via GetEncodingIdFromName()
 *  -# an all-digit name, taken as a codepage number
 *  -# one of the f_wincp_prefixes entries followed by digits
 *     (e.g. "windows-1251"), the digits being taken as a codepage number
 *
 * @param [in] encodingName Name to resolve.
 * @return Id delivered by GetEncodingIdFromName() or
 * GetEncodingIdFromCodePage(), or 0 when nothing matched.
 */
static int
FindEncodingIdFromNameOrAlias(CString encodingName)
{
	USES_CONVERSION;

	// First choice: the name is known verbatim
	unsigned id = GetEncodingIdFromName(T2CA(encodingName));
	if (id != 0)
		return id;

	// Second choice: the whole name is a codepage number
	if (isNumeric(encodingName))
	{
		const unsigned cp = _ttoi(encodingName);
		if (cp != 0)
			id = GetEncodingIdFromCodePage(cp);
		return id;
	}

	// Last choice: <known prefix><codepage number>
	const int prefixCount = sizeof(f_wincp_prefixes) / sizeof(f_wincp_prefixes[0]);
	for (int idx = 0; idx < prefixCount; ++idx)
	{
		// alias is, eg, "WINDOWS-"
		CString alias = f_wincp_prefixes[idx];
		alias.MakeUpper();
		const int aliasLen = alias.GetLength();

		// The name must be strictly longer than the prefix to carry a number
		if (encodingName.GetLength() <= aliasLen)
			continue;

		// Compare the upper-cased head of the name against the prefix
		CString head = encodingName.Left(aliasLen);
		head.MakeUpper();
		if (alias != head)
			continue;

		// name is, eg, "windows-1251", so tail is "1251"
		CString tail = encodingName.Mid(aliasLen);
		if (!isNumeric(tail))
			continue;

		const unsigned cp = _ttoi(tail);
		if (cp != 0)
			id = GetEncodingIdFromCodePage(cp);
		return id;
	}

	return 0; // failed
}

/**
 * @brief Parser for HTML files to find encoding information
 *
 * Looks at <meta> tags whose http-equiv attribute is "content-type" and
 * searches their content attribute for a "charset" parameter.
 *
 * @param [in] src Start of the buffer to scan.
 * @param [in] len Number of bytes available at src.
 * @return Value of GetEncodingCodePageFromId() for the first charset value
 * found, if FindEncodingIdFromNameOrAlias() recognizes it; 0 if no charset
 * parameter is found or the first one found is not recognized.
 */
static unsigned demoGuessEncoding_html(const char *src, size_t len)
{
	// Parse the range [src, src + len) in CMarkdown's Html mode
	CMarkdown markdown(src, src + len, CMarkdown::Html);
	//As <html> and <head> are optional, there is nothing to pull...
	//markdown.Move("html").Pop().Move("head").Pop();
	// Keep going for as long as Move("meta") succeeds
	while (markdown.Move("meta"))
	{
		CMarkdown::String http_equiv = markdown.GetAttribute("http-equiv");
		// Only <meta http-equiv="content-type"> is of interest (case-insensitive)
		if (http_equiv.A && lstrcmpiA(http_equiv.A, "content-type") == 0)
		{
			CMarkdown::String content = markdown.GetAttribute("content");
			if (char *pchKey = content.A)
			{
				// Tokenize content as "key[=value]" items separated by ';' or whitespace.
				// Each pass first skips separators, then measures the key up to the
				// next ';' or '='; a zero-length key ends the scan.
				while (int cchKey = strcspn(pchKey += strspn(pchKey, "; \t\r\n"), ";="))
				{
					// The value starts after the key: skip '=' and whitespace, then
					// measure up to the next ';' or whitespace (may be zero-length)
					char *pchValue = pchKey + cchKey;
					int cchValue = strcspn(pchValue += strspn(pchValue, "= \t\r\n"), "; \t\r\n");
					// Key must begin with "charset" (case-insensitive) and either be
					// exactly that long or have whitespace as its 8th character
					if (cchKey >= 7 && memicmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7])))
					{
						// Terminate the value in place so it can be passed as a C string.
						// NOTE(review): this writes into the buffer behind content.A --
						// presumably a private copy held by CMarkdown::String; confirm.
						pchValue[cchValue] = '\0';
						// Is it an encoding name known to charsets module ?
						unsigned encodingId = FindEncodingIdFromNameOrAlias(pchValue);
						if (encodingId)
						{
							return GetEncodingCodePageFromId(encodingId);
						}
						// A charset was given but not recognized: stop here, no further
						// parameters or <meta> tags are examined
						return 0;
					}
					// Not the charset parameter: continue after this key/value pair
					pchKey = pchValue + cchValue;
				}
			}
		}
	}
	return 0;
}

/**
 * @brief Parser for XML files to find encoding information
 */
static unsigned demoGuessEncoding_xml(const char *src, size_t len)
{
	CMarkdown xml(src, src + len);
	if (xml.Move("?xml"))
	{
		CMarkdown::String encoding = xml.GetAttribute("encoding");
		if (encoding.A)
		{
			// Is it an encoding name we can find in charsets module ?
			unsigned encodingId = FindEncodingIdFromNameOrAlias(encoding.A);
			if (encodingId)
			{
				return GetEncodingCodePageFromId(encodingId);
			}
		}
	}
	return 0;
}

/**
 * @brief Parser for rc files to find encoding information
 *
 * Scans the buffer line by line and returns N from the first line that
 * starts with "#pragma code_page(N)".
 *
 * @param [in] src Start of the buffer to scan; need not be zero-terminated.
 * @param [in] len Number of bytes available at src.
 * @return Codepage number from the first matching line, 0 if there is none.
 *
 * @note sscanf() requires first argument to be zero-terminated so we must
 * copy lines to temporary buffer. Only the bytes of the current line (at
 * most sizeof line - 1 of them) are copied, so nothing at or beyond
 * src + len is ever read.
 */
static unsigned demoGuessEncoding_rc(const char *src, size_t len)
{
	char line[80];
	while (len)
	{
		// Skip the line terminator(s) left over from the previous line
		while (len && (*src == '\r' || *src == '\n'))
		{
			++src;
			--len;
		}
		// Find the end of the current line
		const char *base = src;
		while (len && *src != '\r' && *src != '\n')
		{
			++src;
			--len;
		}
		// Copy just this line, truncated to fit, and terminate it
		size_t cch = static_cast<size_t>(src - base);
		if (cch > sizeof line - 1)
			cch = sizeof line - 1;
		memcpy(line, base, cch);
		line[cch] = '\0';
		// Every line is tested, including a last one without a trailing
		// newline (len is already 0 by the time that line has been read)
		unsigned cp = 0;
		if (sscanf(line, "#pragma code_page(%u)", &cp) == 1)
			return cp;
	}
	return 0;
}

/**
 * @brief Pick the content parser matching a file extension and run it
 *
 * The extension is compared case-insensitively: ".rc" goes to the rc parser,
 * ".htm"/".html" to the HTML parser and ".xml"/".xsl" to the XML parser.
 * At most the first 4096 bytes of the buffer are handed to the parser.
 *
 * @param [in] ext File extension, including the leading dot.
 * @param [in] src Start of the file contents.
 * @param [in] len Number of bytes available at src.
 * @return Codepage reported by the selected parser; 0 for any other
 * extension or when the parser finds nothing.
 */
static unsigned GuessEncoding_from_bytes(LPCTSTR ext, const char *src, size_t len)
{
	const size_t cbScan = (len > 4096) ? 4096 : len;

	if (lstrcmpi(ext, _T(".rc")) == 0)
		return demoGuessEncoding_rc(src, cbScan);

	if (lstrcmpi(ext, _T(".htm")) == 0 || lstrcmpi(ext, _T(".html")) == 0)
		return demoGuessEncoding_html(src, cbScan);

	if (lstrcmpi(ext, _T(".xml")) == 0 || lstrcmpi(ext, _T(".xsl")) == 0)
		return demoGuessEncoding_xml(src, cbScan);

	return 0;
}

/**
 * @brief Try to deduce encoding from file contents given as a pointer array
 *
 * The bytes examined run from data[0] up to (not including) data[count].
 *
 * @param [in] ext File extension, including the leading dot.
 * @param [in] data Array of pointers into the contents; may be NULL, in
 * which case nothing is done.
 * @param [in] count Index of the entry in data that marks the end.
 * @param [out] encoding Cleared and given the detected codepage on success;
 * left untouched otherwise.
 * @return true if a codepage was deduced, false if not.
 */
bool GuessEncoding_from_bytes(LPCTSTR ext, const char **data, int count, FileTextEncoding * encoding)
{
	if (!data)
		return false;

	const char *const first = data[0];
	const size_t cb = data[count] - first;

	const unsigned cp = GuessEncoding_from_bytes(ext, first, cb);
	if (cp == 0)
		return false;

	encoding->Clear();
	encoding->SetCodepage(cp);
	return true;
}

/**
 * @brief Determine the encoding of a file from its byte-order mark or contents
 *
 * The encoding starts out as the default codepage with no BOM and nothing
 * guessed. fi.nByteOrder then selects UCS2LE, UCS2BE or UTF8 (each with
 * m_bom set). When fi.nByteOrder is 1 and bGuessEncoding is set, the
 * contents are parsed according to the file extension instead, and a
 * codepage found that way is stored with m_guessed set.
 *
 * @param [in] filepath File to examine.
 * @param [out] encoding Receives the result.
 * @param [in] bGuessEncoding Whether content-based guessing is allowed.
 */
void GuessCodepageEncoding(LPCTSTR filepath, FileTextEncoding * encoding, BOOL bGuessEncoding)
{
	// NOTE(review): 4096 is presumably the maximum number of bytes loaded -- confirm in CMarkdown::FileImage
	CMarkdown::FileImage fi(filepath, 4096);

	encoding->SetCodepage(getDefaultCodepage());
	encoding->m_bom = false;
	encoding->m_guessed = false;

	// NOTE(review): the nByteOrder values are defined by CMarkdown::FileImage;
	// the three tested here are the ones treated as "BOM present"
	if (fi.nByteOrder == 8 + 2 + 0)
	{
		encoding->SetUnicoding(ucr::UCS2LE);
		encoding->m_bom = true;
	}
	else if (fi.nByteOrder == 8 + 2 + 1)
	{
		encoding->SetUnicoding(ucr::UCS2BE);
		encoding->m_bom = true;
	}
	else if (fi.nByteOrder == 8 + 1)
	{
		encoding->SetUnicoding(ucr::UTF8);
		encoding->m_bom = true;
	}
	else
	{
		encoding->m_bom = false;
	}

	// Content-based guessing applies only for nByteOrder == 1 and when asked for
	if (fi.nByteOrder != 1 || !bGuessEncoding)
		return;

	LPCTSTR extension = PathFindExtension(filepath);
	const char *bytes = (char *)fi.pImage;
	size_t cbBytes = fi.cbImage;
	const unsigned cp = GuessEncoding_from_bytes(extension, bytes, cbBytes);
	if (cp != 0)
	{
		encoding->SetCodepage(cp);
		encoding->m_guessed = true;
	}
}

About Koders | Resources | Downloads | Support | Black Duck | Terms of Service | DMCA | Privacy Policy | Contact Us