2011年6月28日星期二

CSet模板类

template < typename itemType >
int fnCompare( itemType *pitem1, itemType *pitem2 )
{
return pitem1->fnCompare( pitem2 );
};

template
class CSet
{
public:
int m_iMaxNumItems;
int m_iNumIncrementalItems;
int m_iNumItems;
itemType **m_paItems;

public:
CSet( int iMaxNumItems, int iNumIncrementalItems )
{
m_iMaxNumItems = iMaxNumItems;
m_iNumIncrementalItems = iNumIncrementalItems;
m_iNumItems = 0;

int I;

m_paItems = new itemType * [ m_iMaxNumItems ];
for( I = 0; I < m_iMaxNumItems; I ++ )
{
m_paItems[ I ] = NULL;
}
}
~CSet( )
{
int I;

for( I = 0; I < m_iNumItems; I ++ )
{
delete m_paItems[ I ];
m_paItems[ I ] = NULL;
}
delete [] m_paItems;
}

void fnClear( void )
{
int I;
for( I = 0; I < m_iNumItems; I ++ ){
delete m_paItems[ I ];
m_paItems[ I ] = NULL;
}
m_iNumItems = 0;
}

bool fnInsert( itemType *pItem )
{
if( m_iNumItems == 0
|| ( m_iNumItems < m_iMaxNumItems && fnCompare( m_paItems[ m_iNumItems - 1 ], pItem ) ) )
{
m_paItems[ m_iNumItems ] = pItem;

m_iNumItems++;
return true;
}

int I;

//insert the item
int iB, iE, iM;
float fC = -1;
iB = 0;
iE = m_iNumItems - 1;
iM = (iB + iE ) / 2;
do{
if( iE < iB )
break;

iM = (iB + iE ) / 2;
fC = fnCompare( m_paItems[ iM ], pItem );
if( fC == 0 )
{
iE = iM;
break;
}
if( fC < 0 )
iE = iM - 1;
if( fC > 0 )
iB = iM + 1;
}while( true );


if( fC > 0 )
iM ++;//iM is the position need to insert

if( m_iMaxNumItems == m_iNumItems )
{ //when the table is full
if( m_iNumIncrementalItems <= 0 )
{ //no incremental items are allowed
if ( iM >= m_iNumItems )
return false;//
delete m_paItems[ m_iNumItems - 1 ];
for( I = m_iNumItems - 2; I >= iM; I-- )
{
m_paItems[ I + 1 ] = m_paItems[ I ];
}
m_paItems[ I + 1 ] = pItem;
return true;
}
else
{ //incremental items are allowed

//increase the table size
itemType **paNewItems = new itemType * [ m_iMaxNumItems + m_iNumIncrementalItems ];
for( I = 0; I < m_iMaxNumItems; I ++ )
{
paNewItems[ I ] = m_paItems[ I ];
}

for( I = m_iMaxNumItems; I < m_iMaxNumItems + m_iNumIncrementalItems; I ++ )
paNewItems[ I ] = NULL;

delete [] m_paItems;
m_paItems = paNewItems;

m_iMaxNumItems += m_iNumIncrementalItems;
}
}

for( I = m_iNumItems - 1; I >= iM; I -- )
{
m_paItems[ I + 1 ] = m_paItems[ I ];
}
m_paItems[ I + 1 ] = pItem;

m_iNumItems ++;
return true;
}

bool fnDelete( int iBeginItemID, int iNumDeleteItems )
{
int I;

if( ( iBeginItemID < 0 )
|| ( iBeginItemID >= m_iNumItems )
|| ( iNumDeleteItems < 0 ) )
return false;
if( ( iBeginItemID + iNumDeleteItems ) > m_iNumItems )
iNumDeleteItems = m_iNumItems - iBeginItemID;

for( I = 0; I < iNumDeleteItems; I ++ ){
delete m_paItems[ iBeginItemID + I ];
m_paItems[ iBeginItemID + I ] = NULL;
}
for( I = iBeginItemID; I < m_iNumItems - iNumDeleteItems; I ++ ){
m_paItems[ I ] = m_paItems[ I + iNumDeleteItems ];
m_paItems[ I + iNumDeleteItems ] = NULL;
}
m_iNumItems -= iNumDeleteItems;
return true;
}
};

实例化CSet的类itemType需要实现fnCompare( itemType pitem2 )方法.

2011年6月26日星期日

内存泄漏检测

Linux下C++代码内存泄漏检测:

valgrind --leak-check=full ./parser

之前需要安装valgrind (sudo apt-get install valgrind)

mac上安装valgrind可参考
http://valgrind.org/downloads/repository.html

The Current (3.0 and later) Repository

To check out code from the current repository (anonymous, read-only SVN access), do this:
  svn co svn://svn.valgrind.org/valgrind/trunk valgrind
To build the checked out code, follow the instructions in the README file that the checkout should give you. Alternatively, the following should work:
  cd valgrind
  ./autogen.sh
  ./configure --prefix=...
  make
  make install
To do the checkout, you'll need a Subversion client, version 1.1.0 or later. Versions prior to 1.1.0 do not properly handle the symbolic links in our tree.
To do the build, you'll need automake version 1.10 or later and a compatible version of autoconf (e.g. 2.68). These should come as standard on any non-ancient Linux distribution.

2011年6月20日星期一

Moses的一些实现细节

Moses的一些实现细节

使用默认的参数训练,

# distortion (reordering) weight
[weight-d]
0.010878 //distance weight
0.059869 //monotone weight (f ---> e direction)
0.033513 //swap weight (f-->e direction)
0.216945 //discontinuous weight (f-->e direction)
0.057195 //monotone weight (e---> f direction)
0.037017 //swap weight (e-->f direction)
0.111842 //discontinuous weight (f-->e direction)

#distortion样例
国际 ||| in the world ||| 0.016393 0.180328 0.803279 0.016393 0.016393 0.967213
6个特征值分别表示分别对应monotone, swap, discontinuous (f-->e) direction, monotone, swap, discontinuous (e-->f) direction

在解码过程中, 采用stack decoding算法, 每形成一个短语对时, 只能推断f-->e方向的orientation, e-->f方向的orientation需要到下一个短语对形成时计算 (句子最后一个短语对e-->f方向的orientation不计算). oreientation取值只能为monotone, swap, discontinuous中的某一项, 其他两项的特征值置0

# translation model weights
[weight-t]
0.008391 //P(f|e) weight
0.017675 //Lexical(f|e) weight
0.043903 //P(e|f) weight
0.029038 //Lexical(e|f) weight
0.074607 //Phrase Penalty weight

phrase table样例
国际 ||| in the world ||| 0.0174804 0.0214346 0.00197252 7.68173e-05 2.718
5个特征值分别对应以上5项


对未登录词的处理
未登录词的权重为1.0, 特征值为-100. 未登录词的翻译结果为其本身, 但# distortion (reordering) 6个特征值为0, translation model的5个特征值为0.. language mode特征值由lm定.. 以下是未登录词的特征值及权重的一个例子:
score:0.000000 weight:0.010878 //distance feature
score:-1.000000 weight:-0.236574 //word penalty feature
score:-100.000000 weight:1.000000 //unknown word feature
score:0.000000 weight:0.059869 //monotony f-->e
score:0.000000 weight:0.033513 //swap
score:0.000000 weight:0.216945 //discontinuous
score:0.000000 weight:0.057195 //monotony e-->f
score:0.000000 weight:0.037017 //swap
score:0.000000 weight:0.111842 //discontinuous
score:-6.271241 weight:0.062555 //language model
score:0.000000 weight:0.008391 //translation (f|e)
score:0.000000 weight:0.017675 //lexical (f|e)
score:0.000000 weight:0.043903 //translation (e|f)
score:0.000000 weight:0.029038 //lexical (e|f)
score:0.000000 weight:0.074607 //phrase penalty


Future value的计算....
Future value计算考虑的feature包括word penalty feature, unknown word feature, language model, 以及5种translation features; 即不包括distance feature以及各种distortion reordering features

在对hypothesis expanding的时候, 例如输入句子, 此时已翻译的单词为0:
( 海啸 救灾 ) 中国 国际 广播 电台 与 中华 慈善 总会 将 联合 举办 广播 赈灾 义演
“中国 国际 广播”并不作为一个源语言的phrase, moses给的解释是(参见SearchNormal.cpp):
// the basic idea is this: we would like to translate a phrase starting
// from a position further right than the left-most open gap. The
// distortion penalty for the following phrase will be computed relative
// to the ending position of the current extension, so we ask now what
// its maximum value will be (which will always be the value of the
// hypothesis starting at the left-most edge). If this value is less than
// the distortion limit, we don't allow this extension to be made.
计算出来的distortion值为7, 大于默认的distortion limit值6.

BeamSearch的Beamwidth设置为-11.5129251, 即ln(0.00001), 用于存放hypothesis的stack并没有限制其大小.

stack的数据结构为stl set容器, 不清楚各元素是按什么进行排序??

Recombining的条件是与set容器内的某个元素值相同, 但不清楚元素值相同的条件什么??
1) 首先已翻译的单词一致;
2) 最近翻译的f端短语终止位置一致
3) 最近翻译的短语对在f-->e方向上的orientation一致并且特征值也一致?.
??与n-gram无关?

抽取规则时短语的最大长度默认为7 (源端和目标端), 可以max-phrase-length设置

2011年6月14日星期二

perl读文件

写的第一个perl程序, 读文件中的每行
运行需在命令行前加perl, 如perl *.pl

#!/usr/bin/perl -w

my $strfilename = '/home/ldd/Desktop/list.txt';

my $strline;
my $strcommand;

open( FILE, $strfilename ) || die "count not read from $strfilename";

while ( $strline=<FILE>)
{
chomp ( $strline );
$strcommand = '/home/ldd/toolkit/charniak/PARSE/parseIt -K -LCh -l200 /home/ldd/toolkit/charniak/ctb5.1/ /home/ldd/Desktop/ACE2005CN/'.$strline.' > /home/ldd/Desktop/ACE2005CN/'.$strline.'.charniak';
print $strcommand."\n";
system( $strcommand );
}
close FILE;

2011年6月7日星期二

fbis语料预处理
中文:
第69243句子包含英文句子"whatisfirmlyestablishedcannotbeuprooted , whatisfirmlygraspedcannotslipaway , itwillbehonoredfromgenerationtogeneration", 手工删除

英文:


nist语料预处理
中文:

英文:
需将一些非英文字母替换, 例如:
Jürgen Sandkühler --> Jurgen Sandkuhler

2011年6月5日星期日

(C/C++)UTF8字符串中字的切分

UTF-8 采用变长度字节来表示字符,理论上最多可以到 6 个字节长度。UTF-8 编码兼容了 ASC II(0-127), 也就是说 UTF-8 对于 ASC II 字符的编码是和 ASC II 一样的。对于超过一个字节长度的字符,才用以下编码规范:
左边第一个字节1的个数表示这个字符编码字节的位数,例如两位字节字符编码样式为为:110xxxxx 10xxxxxx; 三位字节字符的编码样式为:1110xxxx 10xxxxxx 10xxxxxx.;以此类推,六位字节字符的编码样式为:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。 xxx 的值由字符编码的二进制表示的位填入。

1字节:0xxxxxxx
2字节:110xxxxx 10xxxxxx
3字节:1110xxxx 10xxxxxx 10xxxxxx
4字节:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字节:111110xx 10xxxxxx 10xxxxxx 10xxxxxx
6字节:1111110x 10xxxxxx 10xxxxxx 10xxxxxx


void fnReadCharactersUTF8( char *pszSentence, vector& vec )
{
int iLen;
iLen = strlen( pszSentence );

char *p;

p = pszSentence;

unsigned char *q;

char szCharacter[ 101 ];
int iChar;

int iNumChars;
iNumChars = 0;

vec.clear( );

string strCharacter;

while( p != NULL && strlen( p ) > 0 )
{
q = (unsigned char *)p;
if ( q[ 0 ] < 0x80 )
{
//p[ 0 ] must be an ASCII character
iChar = 0;
szCharacter[ iChar++ ] = p[ 0 ];
p++;
q = (unsigned char *)p;
while( p != NULL && q[ 0 ] < 0x80 )
{
szCharacter[ iChar++ ] = p[ 0 ];
p++;
q = (unsigned char *)p;
}
szCharacter[ iChar ] = '\0';

vec.push_back( string( szCharacter ) );

iNumChars++;
}
else if ( q[ 0 ] < 0xC0 )
{
//invalid char between 0x80 and 0xC0
p++;
}
else if ( q[ 0 ] < 0xE0 )
{
//two chars
szCharacter[ 0 ] = p[ 0 ];
szCharacter[ 1 ] = p[ 1 ];
szCharacter[ 2 ] = '\0';
p = p + 2;

strCharacter = string( szCharacter );
vec.push_back( strCharacter );

iNumChars++;
}
else if ( q[ 0 ] < 0xF0 )
{
//three chars
szCharacter[ 0 ] = p[ 0 ];
szCharacter[ 1 ] = p[ 1 ];
szCharacter[ 2 ] = p[ 2 ];
szCharacter[ 3 ] = '\0';
p = p + 3;

strCharacter = string( szCharacter );
vec.push_back( strCharacter );

//printf( "%s ", strCharacter.c_str( ) );

iNumChars++;
}
else if ( q[ 0 ] < 0xF8 )
{
//four chars
p += 4;
}
else if ( q[ 0 ] < 0xFC )
{
//five chars
p += 5;
}
else if ( q[0] < 0xFE )
{
//6 chars
p += 5;
}
else
{
//>=0xFE
p++;
}
}
}