Delphi and AI Natural Language Processing: Implementing Intelligent Text Analysis
Natural language processing (NLP) is a major branch of artificial intelligence focused on the interaction between computers and human language. This article explores how to integrate NLP technology into Delphi applications to implement intelligent text analysis and processing.
Introduction to Natural Language Processing
Natural language processing covers a range of techniques and tasks, chiefly:
- Text classification: assigning text to predefined categories
- Sentiment analysis: identifying the emotions and opinions expressed in text
- Named entity recognition: identifying people, places, organizations, and other entities in text
- Text summarization: automatically producing a concise summary of a text
- Machine translation: translating text between languages
- Question answering: understanding a question and providing an accurate answer
- Text generation: producing natural, coherent text
Why Integrate NLP into Delphi Applications?
Integrating NLP into a Delphi application brings the following benefits:
- Better user experience: natural-language interaction makes applications easier to use
- Automated text processing: less manual text-handling work
- Valuable insights: useful information extracted from large volumes of text data
- Personalized content: content tailored to user preferences
- Multilingual support: cross-language features and services
Technical Preparation
Before you start integrating NLP, you will need the following:
- Delphi development environment: Delphi 10.4 or later is recommended
- NLP library or API: such as the OpenAI API, the Google Cloud Natural Language API, or a local NLP library
- REST client components: for API calls, such as Indy, THTTPClient, or TRESTClient (a minimal connectivity sketch follows this list)
- JSON processing library: System.JSON or a third-party JSON library
- Text processing components: for basic text handling and analysis
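Before writing any NLP-specific code, it is worth confirming that the basic REST and JSON plumbing works. The snippet below is only a minimal sketch of a GET request with THTTPClient and response parsing with System.JSON; the URL and the status field are placeholders, not a real service.
// Minimal connectivity sketch (requires System.SysUtils, System.JSON, System.Net.HttpClient).
// https://example.com/api/status and the "status" field are placeholders for illustration.
procedure CheckServiceStatus;
var
  Client: THTTPClient;
  Response: IHTTPResponse;
  Json: TJSONObject;
begin
  Client := THTTPClient.Create;
  try
    Response := Client.Get('https://example.com/api/status');
    Json := TJSONObject.ParseJSONValue(Response.ContentAsString) as TJSONObject;
    if Json <> nil then
    try
      Writeln('Service status: ' + Json.GetValue<string>('status'));
    finally
      Json.Free;
    end;
  finally
    Client.Free;
  end;
end;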
Basics: Text Preprocessing
Text preprocessing is the foundational step of NLP and includes tokenization, stop-word removal, stemming, and similar operations:
unit TextPreprocessor;
interface
uses
System.SysUtils, System.Classes, System.RegularExpressions,
System.Generics.Collections;
type
TTextPreprocessor = class
private
FStopWords: TStringList;
procedure LoadStopWords(const FileName: string);
public
constructor Create;
destructor Destroy; override;
function Tokenize(const Text: string): TArray<string>;
function RemoveStopWords(const Tokens: TArray<string>): TArray<string>;
function Stemming(const Token: string): string;
function Normalize(const Text: string): string;
function Preprocess(const Text: string): TArray<string>;
end;
implementation
{ TTextPreprocessor }
constructor TTextPreprocessor.Create;
begin
inherited;
FStopWords := TStringList.Create;
LoadStopWords('stopwords.txt');
end;
destructor TTextPreprocessor.Destroy;
begin
FStopWords.Free;
inherited;
end;
procedure TTextPreprocessor.LoadStopWords(const FileName: string);
begin
if FileExists(FileName) then
FStopWords.LoadFromFile(FileName)
else
begin
// Default Chinese stop-word list (simplified)
FStopWords.Add('的');
FStopWords.Add('了');
FStopWords.Add('是');
FStopWords.Add('在');
FStopWords.Add('我');
FStopWords.Add('有');
FStopWords.Add('和');
FStopWords.Add('就');
FStopWords.Add('不');
FStopWords.Add('人');
FStopWords.Add('都');
FStopWords.Add('一');
FStopWords.Add('上');
FStopWords.Add('也');
FStopWords.Add('很');
// More stop words...
end;
end;
function TTextPreprocessor.Tokenize(const Text: string): TArray<string>;
var
Tokens: TList<string>;
Token, Trimmed: string;
begin
Tokens := TList<string>.Create;
try
// Simple delimiter-based tokenization (suitable for English)
// Chinese text requires a dedicated word-segmentation algorithm or library
for Token in Text.Split([' ', ',', '.', '!', '?', ';', ':', '"', '''', '(', ')', '[', ']', '{', '}', #13, #10]) do
begin
// The for-in loop variable cannot be reassigned, so trim into a separate variable
Trimmed := Token.Trim;
if not Trimmed.IsEmpty then
Tokens.Add(Trimmed);
end;
Result := Tokens.ToArray;
finally
Tokens.Free;
end;
end;
function TTextPreprocessor.RemoveStopWords(const Tokens: TArray<string>): TArray<string>;
var
FilteredTokens: TList<string>;
Token: string;
begin
FilteredTokens := TList<string>.Create;
try
for Token in Tokens do
begin
if FStopWords.IndexOf(Token.ToLower) < 0 then
FilteredTokens.Add(Token);
end;
Result := FilteredTokens.ToArray;
finally
FilteredTokens.Free;
end;
end;
function TTextPreprocessor.Stemming(const Token: string): string;
begin
// Simple stemming example (English only)
// A real application should use a dedicated stemming algorithm or library
Result := Token;
// Strip common suffixes
if Token.EndsWith('ing') then
Result := Token.Substring(0, Token.Length - 3)
else if Token.EndsWith('ed') then
Result := Token.Substring(0, Token.Length - 2)
else if Token.EndsWith('s') then
Result := Token.Substring(0, Token.Length - 1);
end;
function TTextPreprocessor.Normalize(const Text: string): string;
begin
// Text normalization: lowercase the text, collapse extra whitespace, etc.
Result := Text.ToLower;
Result := TRegEx.Replace(Result, '\s+', ' ');
Result := Result.Trim;
end;
function TTextPreprocessor.Preprocess(const Text: string): TArray<string>;
var
NormalizedText: string;
Tokens, TokensWithoutStopWords: TArray<string>;
I: Integer;
begin
// The complete preprocessing pipeline
NormalizedText := Normalize(Text);
Tokens := Tokenize(NormalizedText);
TokensWithoutStopWords := RemoveStopWords(Tokens);
SetLength(Result, Length(TokensWithoutStopWords));
for I := 0 to Length(TokensWithoutStopWords) - 1 do
Result[I] := Stemming(TokensWithoutStopWords[I]);
end;
end.
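A quick, illustrative usage sketch of this class (the sample sentence and console output are only for demonstration; with an English stopwords.txt in place the common function words would also be removed):
procedure DemoPreprocessing;
var
  Preprocessor: TTextPreprocessor;
  Token: string;
begin
  Preprocessor := TTextPreprocessor.Create;
  try
    // "jumping" and "dogs" are reduced to "jump" and "dog" by the simple stemmer
    for Token in Preprocessor.Preprocess('The quick brown fox is jumping over the lazy dogs') do
      Writeln(Token);
  finally
    Preprocessor.Free;
  end;
end;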
Intermediate: Sentiment Analysis with a Cloud Service API
Sentiment analysis is a common NLP application and can be implemented easily through a cloud service API:
unit SentimentAnalyzer;
interface
uses
System.SysUtils, System.Classes, System.Net.HttpClient, System.Net.URLClient,
System.JSON;
type
TSentimentResult = record
Score: Double;
Magnitude: Double;
Sentiment: string;
constructor Create(AScore, AMagnitude: Double);
end;
TSentimentAnalyzer = class
private
FApiKey: string;
FEndpoint: string;
FHttpClient: THTTPClient;
public
constructor Create(const ApiKey: string);
destructor Destroy; override;
function AnalyzeSentiment(const Text: string): TSentimentResult;
function AnalyzeSentimentBatch(const Texts: TArray<string>): TArray<TSentimentResult>;
end;
implementation
{ TSentimentResult }
constructor TSentimentResult.Create(AScore, AMagnitude: Double);
begin
Score := AScore;
Magnitude := AMagnitude;
// Determine the sentiment label from the score
if Score >= 0.25 then
Sentiment := 'Positive'
else if Score <= -0.25 then
Sentiment := 'Negative'
else
Sentiment := 'Neutral';
end;
{ TSentimentAnalyzer }
constructor TSentimentAnalyzer.Create(const ApiKey: string);
begin
inherited Create;
FApiKey := ApiKey;
FEndpoint := 'https://language.googleapis.com/v1/documents:analyzeSentiment';
FHttpClient := THTTPClient.Create;
end;
destructor TSentimentAnalyzer.Destroy;
begin
FHttpClient.Free;
inherited;
end;
function TSentimentAnalyzer.AnalyzeSentiment(const Text: string): TSentimentResult;
var
RequestObj, DocumentObj: TJSONObject;
RequestBody: TStringStream;
Response: IHTTPResponse;
ResponseContent: string;
ResponseObj, SentimentObj: TJSONObject;
Score, Magnitude: Double;
begin
RequestObj := TJSONObject.Create;
DocumentObj := TJSONObject.Create;
try
// Build the request
DocumentObj.AddPair('type', 'PLAIN_TEXT');
DocumentObj.AddPair('content', Text);
RequestObj.AddPair('document', DocumentObj);
RequestObj.AddPair('encodingType', 'UTF8');
// Set the request headers
FHttpClient.CustomHeaders['Content-Type'] := 'application/json';
// Send the request (UTF-8 encode the body and free the stream afterwards)
RequestBody := TStringStream.Create(RequestObj.ToJSON, TEncoding.UTF8);
try
Response := FHttpClient.Post(FEndpoint + '?key=' + FApiKey, RequestBody, nil);
ResponseContent := Response.ContentAsString;
finally
RequestBody.Free;
end;
// Parse the response
if Response.StatusCode = 200 then
begin
ResponseObj := TJSONObject.ParseJSONValue(ResponseContent) as TJSONObject;
try
if ResponseObj.GetValue('documentSentiment') <> nil then
begin
SentimentObj := ResponseObj.GetValue('documentSentiment') as TJSONObject;
Score := SentimentObj.GetValue<Double>('score');
Magnitude := SentimentObj.GetValue<Double>('magnitude');
Result := TSentimentResult.Create(Score, Magnitude);
end
else
Result := TSentimentResult.Create(0, 0);
finally
ResponseObj.Free;
end;
end
else
raise Exception.CreateFmt('API error: %d - %s', [Response.StatusCode, ResponseContent]);
finally
RequestObj.Free;
end;
end;
function TSentimentAnalyzer.AnalyzeSentimentBatch(const Texts: TArray<string>): TArray<TSentimentResult>;
var
I: Integer;
begin
SetLength(Result, Length(Texts));
for I := 0 to Length(Texts) - 1 do
Result[I] := AnalyzeSentiment(Texts[I]);
end;
end.
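A minimal usage sketch of the analyzer, assuming a valid Google Cloud API key:
procedure ShowSentiment(const ApiKey, Text: string);
var
  Analyzer: TSentimentAnalyzer;
  Res: TSentimentResult;
begin
  Analyzer := TSentimentAnalyzer.Create(ApiKey);
  try
    Res := Analyzer.AnalyzeSentiment(Text);
    // score runs roughly from -1 (negative) to +1 (positive);
    // magnitude indicates how strongly the sentiment is expressed
    Writeln(Format('%s (score %.2f, magnitude %.2f)',
      [Res.Sentiment, Res.Score, Res.Magnitude]));
  finally
    Analyzer.Free;
  end;
end;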
Advanced: Named Entity Recognition
Named entity recognition (NER) extracts people, places, organizations, and other entities from text:
unit EntityRecognizer;
interface
uses
System.SysUtils, System.Classes, System.Net.HttpClient, System.Net.URLClient,
System.JSON, System.Generics.Collections;
type
TEntityType = (etUnknown, etPerson, etLocation, etOrganization, etEvent,
etProduct, etDate, etNumber, etPrice, etOther);
TEntity = record
Text: string;
EntityType: TEntityType;
Salience: Double;
BeginOffset: Integer;
EndOffset: Integer;
constructor Create(const AText: string; AType: TEntityType; ASalience: Double;
ABeginOffset, AEndOffset: Integer);
end;
TEntityRecognizer = class
private
FApiKey: string;
FEndpoint: string;
FHttpClient: THTTPClient;
function EntityTypeFromString(const TypeStr: string): TEntityType;
public
constructor Create(const ApiKey: string);
destructor Destroy; override;
function RecognizeEntities(const Text: string): TArray<TEntity>;
end;
implementation
{ TEntity }
constructor TEntity.Create(const AText: string; AType: TEntityType; ASalience: Double;
ABeginOffset, AEndOffset: Integer);
begin
Text := AText;
EntityType := AType;
Salience := ASalience;
BeginOffset := ABeginOffset;
EndOffset := AEndOffset;
end;
{ TEntityRecognizer }
constructor TEntityRecognizer.Create(const ApiKey: string);
begin
inherited Create;
FApiKey := ApiKey;
FEndpoint := 'https://language.googleapis.com/v1/documents:analyzeEntities';
FHttpClient := THTTPClient.Create;
end;
destructor TEntityRecognizer.Destroy;
begin
FHttpClient.Free;
inherited;
end;
function TEntityRecognizer.EntityTypeFromString(const TypeStr: string): TEntityType;
begin
if TypeStr = 'PERSON' then
Result := etPerson
else if TypeStr = 'LOCATION' then
Result := etLocation
else if TypeStr = 'ORGANIZATION' then
Result := etOrganization
else if TypeStr = 'EVENT' then
Result := etEvent
else if TypeStr = 'CONSUMER_GOOD' then
Result := etProduct
else if TypeStr = 'DATE' then
Result := etDate
else if TypeStr = 'NUMBER' then
Result := etNumber
else if TypeStr = 'PRICE' then
Result := etPrice
else if TypeStr = 'OTHER' then
Result := etOther
else
Result := etUnknown;
end;
function TEntityRecognizer.RecognizeEntities(const Text: string): TArray<TEntity>;
var
RequestObj, DocumentObj: TJSONObject;
RequestBody: TStringStream;
Response: IHTTPResponse;
ResponseContent: string;
ResponseObj: TJSONObject;
EntitiesArray, MentionsArray: TJSONArray;
EntityObj, MentionObj: TJSONObject;
Entities: TList<TEntity>;
EntityType: TEntityType;
EntityText: string;
Salience: Double;
BeginOffset, EndOffset: Integer;
I, J: Integer;
begin
RequestObj := TJSONObject.Create;
DocumentObj := TJSONObject.Create;
Entities := TList<TEntity>.Create;
try
// Build the request
DocumentObj.AddPair('type', 'PLAIN_TEXT');
DocumentObj.AddPair('content', Text);
RequestObj.AddPair('document', DocumentObj);
RequestObj.AddPair('encodingType', 'UTF8');
// Set the request headers
FHttpClient.CustomHeaders['Content-Type'] := 'application/json';
// Send the request (UTF-8 encode the body and free the stream afterwards)
RequestBody := TStringStream.Create(RequestObj.ToJSON, TEncoding.UTF8);
try
Response := FHttpClient.Post(FEndpoint + '?key=' + FApiKey, RequestBody, nil);
ResponseContent := Response.ContentAsString;
finally
RequestBody.Free;
end;
// Parse the response
if Response.StatusCode = 200 then
begin
ResponseObj := TJSONObject.ParseJSONValue(ResponseContent) as TJSONObject;
try
if ResponseObj.GetValue('entities') <> nil then
begin
EntitiesArray := ResponseObj.GetValue('entities') as TJSONArray;
for I := 0 to EntitiesArray.Count - 1 do
begin
EntityObj := EntitiesArray.Items[I] as TJSONObject;
EntityText := EntityObj.GetValue<string>('name');
EntityType := EntityTypeFromString(EntityObj.GetValue<string>('type'));
Salience := EntityObj.GetValue<Double>('salience');
// Read the entity's mention information
if EntityObj.GetValue('mentions') <> nil then
begin
MentionsArray := EntityObj.GetValue('mentions') as TJSONArray;
for J := 0 to MentionsArray.Count - 1 do
begin
MentionObj := MentionsArray.Items[J] as TJSONObject;
if MentionObj.GetValue('text') <> nil then
begin
BeginOffset := (MentionObj.GetValue('text') as TJSONObject).GetValue<Integer>('beginOffset');
EndOffset := BeginOffset + EntityText.Length;
Entities.Add(TEntity.Create(EntityText, EntityType, Salience, BeginOffset, EndOffset));
end;
end;
end;
end;
end;
finally
ResponseObj.Free;
end;
end
else
raise Exception.CreateFmt('API error: %d - %s', [Response.StatusCode, ResponseContent]);
Result := Entities.ToArray;
finally
RequestObj.Free;
Entities.Free;
end;
end;
end.
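And a brief usage sketch, again assuming a valid API key (the input sentence is only an example):
procedure ListEntities(const ApiKey: string);
var
  Recognizer: TEntityRecognizer;
  Entity: TEntity;
begin
  Recognizer := TEntityRecognizer.Create(ApiKey);
  try
    for Entity in Recognizer.RecognizeEntities('Ada Lovelace worked with Charles Babbage in London.') do
      Writeln(Format('%s (salience %.2f, offset %d)',
        [Entity.Text, Entity.Salience, Entity.BeginOffset]));
  finally
    Recognizer.Free;
  end;
end;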
Practical Application Examples
1. Intelligent Customer Feedback Analysis
// Analyzing customer feedback with NLP
procedure TFeedbackAnalysisForm.AnalyzeFeedback;
var
Preprocessor: TTextPreprocessor;
SentimentAnalyzer: TSentimentAnalyzer;
EntityRecognizer: TEntityRecognizer;
Feedback, ProcessedFeedback: string;
Tokens: TArray<string>;
SentimentResult: TSentimentResult;
Entities: TArray<TEntity>;
Entity: TEntity;
I: Integer;
begin
Feedback := memFeedback.Text;
if Feedback.Trim.IsEmpty then
Exit;
Preprocessor := TTextPreprocessor.Create;
SentimentAnalyzer := TSentimentAnalyzer.Create(ConfigManager.GetValue('GOOGLE_API_KEY'));
EntityRecognizer := TEntityRecognizer.Create(ConfigManager.GetValue('GOOGLE_API_KEY'));
try
// Preprocess the text
Tokens := Preprocessor.Preprocess(Feedback);
ProcessedFeedback := String.Join(' ', Tokens);
// Sentiment analysis
SentimentResult := SentimentAnalyzer.AnalyzeSentiment(Feedback);
// Entity recognition
Entities := EntityRecognizer.RecognizeEntities(Feedback);
// Display the results
memResults.Lines.Add('Sentiment analysis result:');
memResults.Lines.Add(Format('  Sentiment: %s', [SentimentResult.Sentiment]));
memResults.Lines.Add(Format('  Score: %.2f', [SentimentResult.Score]));
memResults.Lines.Add(Format('  Magnitude: %.2f', [SentimentResult.Magnitude]));
memResults.Lines.Add('');
memResults.Lines.Add('Recognized entities:');
for Entity in Entities do
begin
memResults.Lines.Add(Format('  Text: %s', [Entity.Text]));
memResults.Lines.Add(Format('  Type: %s', [GetEntityTypeName(Entity.EntityType)]));
memResults.Lines.Add(Format('  Salience: %.2f', [Entity.Salience]));
memResults.Lines.Add('');
end;
// Classify the feedback based on the analysis results
ClassifyFeedback(SentimentResult, Entities);
finally
Preprocessor.Free;
SentimentAnalyzer.Free;
EntityRecognizer.Free;
end;
end;
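The procedure above calls two helpers that are not shown in this article, GetEntityTypeName and ClassifyFeedback. A minimal sketch of GetEntityTypeName, assuming plain display strings are all that is needed:
function GetEntityTypeName(EntityType: TEntityType): string;
begin
  case EntityType of
    etPerson: Result := 'Person';
    etLocation: Result := 'Location';
    etOrganization: Result := 'Organization';
    etEvent: Result := 'Event';
    etProduct: Result := 'Product';
    etDate: Result := 'Date';
    etNumber: Result := 'Number';
    etPrice: Result := 'Price';
    etOther: Result := 'Other';
  else
    Result := 'Unknown';
  end;
end;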
2. Intelligent Document Classifier
// Classifying documents with NLP
function TDocumentClassifier.ClassifyDocument(const DocumentText: string): TArray<string>;
var
LLMClient: TLLMClient;
Prompt, Response: string;
Categories: TStringList;
begin
LLMClient := TLLMClient.Create(ConfigManager.GetValue('OPENAI_API_KEY'), lpOpenAI);
Categories := TStringList.Create;
try
LLMClient.Model := 'gpt-3.5-turbo';
Prompt := Format(
'Classify the following document into the most appropriate categories. You may choose more than one category, but no more than three.' + sLineBreak +
'Available categories: Technical Documentation, Marketing, Financial Report, Legal Document, Product Description, Customer Service, Internal Communication, Research Report, Educational Material, Other' + sLineBreak +
'Return only the category names, separated by commas, with no other text.' + sLineBreak + sLineBreak +
'Document content:' + sLineBreak +
'%s',
[DocumentText]
);
Response := LLMClient.SendMessage(Prompt);
// Parse the response: split on commas only (CommaText would also split on spaces)
Categories.StrictDelimiter := True;
Categories.Delimiter := ',';
Categories.DelimitedText := Response;
for var I := 0 to Categories.Count - 1 do
Categories[I] := Categories[I].Trim;
Result := Categories.ToStringArray;
finally
LLMClient.Free;
Categories.Free;
end;
end;
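A hypothetical caller (DocClassifier, memDocument, and lstCategories are assumed to exist on the host form):
procedure TDocumentForm.btnClassifyClick(Sender: TObject);
var
  Category: string;
begin
  // Each returned category is added to a list box for review
  for Category in DocClassifier.ClassifyDocument(memDocument.Lines.Text) do
    lstCategories.Items.Add(Category);
end;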
3. Multilingual Translator
// Translating text with NLP
function TTranslator.TranslateText(const Text, SourceLanguage, TargetLanguage: string): string;
var
HttpClient: THTTPClient;
RequestObj: TJSONObject;
RequestBody: TStringStream;
Response: IHTTPResponse;
ResponseContent: string;
ResponseObj: TJSONObject;
TranslationsArray: TJSONArray;
begin
Result := '';
HttpClient := THTTPClient.Create;
RequestObj := TJSONObject.Create;
try
// Build the request
RequestObj.AddPair('q', Text);
RequestObj.AddPair('source', SourceLanguage);
RequestObj.AddPair('target', TargetLanguage);
RequestObj.AddPair('format', 'text');
// Set the request headers (an API key is passed via the key query parameter;
// the Authorization: Bearer header expects an OAuth access token instead)
HttpClient.CustomHeaders['Content-Type'] := 'application/json';
// Send the request (UTF-8 encode the body and free the stream afterwards)
RequestBody := TStringStream.Create(RequestObj.ToJSON, TEncoding.UTF8);
try
Response := HttpClient.Post('https://translation.googleapis.com/language/translate/v2?key=' +
ConfigManager.GetValue('GOOGLE_API_KEY'), RequestBody, nil);
ResponseContent := Response.ContentAsString;
finally
RequestBody.Free;
end;
// Parse the response
if Response.StatusCode = 200 then
begin
ResponseObj := TJSONObject.ParseJSONValue(ResponseContent) as TJSONObject;
try
if (ResponseObj.GetValue('data') <> nil) and
((ResponseObj.GetValue('data') as TJSONObject).GetValue('translations') <> nil) then
begin
TranslationsArray := (ResponseObj.GetValue('data') as TJSONObject).GetValue('translations') as TJSONArray;
if TranslationsArray.Count > 0 then
Result := (TranslationsArray.Items[0] as TJSONObject).GetValue<string>('translatedText');
end;
finally
ResponseObj.Free;
end;
end
else
raise Exception.CreateFmt('API error: %d - %s', [Response.StatusCode, ResponseContent]);
finally
HttpClient.Free;
RequestObj.Free;
end;
end;
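A hypothetical caller (Translator, memSource, and memTarget are assumed to exist elsewhere; 'zh' and 'en' are ISO-639-1 language codes accepted by the Translation API):
procedure TTranslatorForm.btnTranslateClick(Sender: TObject);
begin
  // Translate the source memo from Chinese to English
  memTarget.Lines.Text := Translator.TranslateText(memSource.Lines.Text, 'zh', 'en');
end;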
Best Practices and Considerations
- Performance: NLP calls can be compute- and network-intensive; consider asynchronous processing (see the sketch after this list)
- API limits: understand and respect your API provider's usage limits and billing policies
- Multilingual support: make sure the NLP solution supports the target languages, especially non-English languages such as Chinese
- Privacy: protect user privacy when processing user text data
- Result validation: NLP results are not always accurate, so implement appropriate validation and correction mechanisms
- User feedback: collect user feedback and keep improving the NLP features
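As a concrete illustration of the first point, network-bound NLP calls can be moved off the main thread with System.Threading. This is a minimal sketch, assuming the TSentimentAnalyzer and ConfigManager shown earlier and a memResults memo on the calling form:
// Requires System.Threading (TTask) and System.Classes (TThread)
procedure TFeedbackAnalysisForm.AnalyzeSentimentAsync(const Text: string);
begin
  TTask.Run(
    procedure
    var
      Analyzer: TSentimentAnalyzer;
      Res: TSentimentResult;
    begin
      Analyzer := TSentimentAnalyzer.Create(ConfigManager.GetValue('GOOGLE_API_KEY'));
      try
        Res := Analyzer.AnalyzeSentiment(Text);
      finally
        Analyzer.Free;
      end;
      // UI controls may only be touched from the main thread
      TThread.Queue(nil,
        procedure
        begin
          memResults.Lines.Add(Format('Sentiment: %s (%.2f)', [Res.Sentiment, Res.Score]));
        end);
    end);
end;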
Conclusion
By integrating natural language processing into Delphi applications, we can implement intelligent text analysis, sentiment detection, entity extraction, and other advanced features that noticeably raise an application's level of intelligence and improve the user experience. Whether through cloud service APIs or local NLP libraries, Delphi developers have several options for building these capabilities.
As NLP technology continues to evolve and mature, we can expect more innovative NLP applications to appear in software built with Delphi. Through continued learning and practice, Delphi developers can take full advantage of these technologies to create smarter, more valuable applications.
About the author: 付乙 (Fu Yi), a senior Delphi developer focused on combining modern technologies with traditional applications to improve software value and user experience.