VintaSoft Imaging .NET SDK 14.0: Documentation for .NET developer
Vintasoft.Imaging.Text Namespace / TextRegion Class / GetWords Methods / GetWords(Predicate<String>,Predicate<String>,Predicate<String>) Method
Syntax Example Requirements SeeAlso
In This Topic
    GetWords(Predicate<String>,Predicate<String>,Predicate<String>) Method (TextRegion)
    In This Topic
    Returns the words of this text region.
    Syntax

    Parameters

    wordCharacterPredicate
    The word character predicate that determines which characters are allowed in words.
    wordDelimiterPredicate
    The word delimiter predicate that determines which characters are word delimiters.
    whiteSpaceCharacterPredicate
    The white space character predicate that determines which characters are white space.

    Return Value

    An array of TextRegion objects, each of which defines a word.
    Example

    This C#/VB.NET code shows how to extract only the words that consist of letters and digits from a PDF page.

    
    ''' <summary>
    ''' Determines whether the specified symbol is a word character.
    ''' </summary>
    ''' <param name="symbol">The symbol to test.</param>
    ''' <returns>
    ''' <c>True</c> if the symbol is reported as a UTF-32 symbol by
    ''' <c>UnicodeCharacterCollection.IsUtf32Symbol</c>, if its first character is a letter
    ''' or a digit, or if the symbol is the underscore; otherwise <c>False</c>.
    ''' A null or empty symbol yields <c>False</c>.
    ''' </returns>
    Private Shared Function IsWordCharacter(symbol As String) As Boolean
        ' guard: indexing symbol(0) below fails on a null or empty string
        If String.IsNullOrEmpty(symbol) Then
            Return False
        End If
        ' symbols reported as UTF-32 are always accepted as word characters
        If Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0) Then
            Return True
        End If
        Return Char.IsLetterOrDigit(symbol(0)) OrElse symbol = "_"
    End Function
    
    ''' <summary>
    ''' Determines whether the specified symbol is a punctuation mark.
    ''' </summary>
    ''' <param name="symbol">The symbol to test.</param>
    ''' <returns>
    ''' <c>True</c> if the first character of the symbol is punctuation according to
    ''' <c>Char.IsPunctuation</c>; <c>False</c> if the symbol is reported as a UTF-32 symbol
    ''' by <c>UnicodeCharacterCollection.IsUtf32Symbol</c>, or if it is null or empty.
    ''' </returns>
    Private Shared Function IsPunctuation(symbol As String) As Boolean
        ' guard: indexing symbol(0) below fails on a null or empty string
        If String.IsNullOrEmpty(symbol) Then
            Return False
        End If
        ' symbols reported as UTF-32 are never treated as punctuation
        If Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0) Then
            Return False
        End If
        Return Char.IsPunctuation(symbol(0))
    End Function
    
    ''' <summary>
    ''' Determines whether the specified symbol is white space.
    ''' </summary>
    ''' <param name="symbol">The symbol to test.</param>
    ''' <returns>
    ''' <c>True</c> if the first character of the symbol is white space according to
    ''' <c>Char.IsWhiteSpace</c>; <c>False</c> if the symbol is reported as a UTF-32 symbol
    ''' by <c>UnicodeCharacterCollection.IsUtf32Symbol</c>, or if it is null or empty.
    ''' </returns>
    Private Shared Function IsWhiteSpace(symbol As String) As Boolean
        ' guard: indexing symbol(0) below fails on a null or empty string
        If String.IsNullOrEmpty(symbol) Then
            Return False
        End If
        ' symbols reported as UTF-32 are never treated as white space
        If Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0) Then
            Return False
        End If
        Return Char.IsWhiteSpace(symbol(0))
    End Function
    
    ''' <summary>
    ''' Extracts the words of a PDF page, using the IsWordCharacter, IsPunctuation and
    ''' IsWhiteSpace predicates to split the page text into words.
    ''' </summary>
    ''' <param name="page">The PDF page whose text region is searched for words.</param>
    ''' <returns>The text of every extracted word, each followed by a line terminator.</returns>
    Public Shared Function GetLetterAndDigitWordsFromPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As String
        ' split the text region of the page into words with the custom predicates
        Dim pageWords As Vintasoft.Imaging.Text.TextRegion() = page.TextRegion.GetWords(AddressOf IsWordCharacter, AddressOf IsPunctuation, AddressOf IsWhiteSpace)

        ' write the text of each word on its own line
        Dim output As New System.Text.StringBuilder()
        For index As Integer = 0 To pageWords.Length - 1
            output.AppendLine(pageWords(index).TextContent)
        Next

        Return output.ToString()
    End Function
    
    
    
    /// <summary>
    /// Determines whether the specified symbol is a word character.
    /// </summary>
    /// <param name="symbol">The symbol to test.</param>
    /// <returns>
    /// <c>true</c> if the symbol is reported as a UTF-32 symbol by
    /// <c>UnicodeCharacterCollection.IsUtf32Symbol</c>, if its first character is a letter
    /// or a digit, or if the symbol is the underscore; otherwise <c>false</c>.
    /// A null or empty symbol yields <c>false</c>.
    /// </returns>
    private static bool IsWordCharacter(string symbol)
    {
        // guard: symbol[0] below would throw on a null or empty string
        if (string.IsNullOrEmpty(symbol))
        {
            return false;
        }

        // symbols reported as UTF-32 are always accepted as word characters
        if (Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0))
        {
            return true;
        }

        return char.IsLetterOrDigit(symbol[0]) || symbol == "_";
    }
    
    /// <summary>
    /// Determines whether the specified symbol is a punctuation mark.
    /// </summary>
    /// <param name="symbol">The symbol to test.</param>
    /// <returns>
    /// <c>true</c> if the first character of the symbol is punctuation according to
    /// <c>char.IsPunctuation</c>; <c>false</c> if the symbol is reported as a UTF-32 symbol
    /// by <c>UnicodeCharacterCollection.IsUtf32Symbol</c>, or if it is null or empty.
    /// </returns>
    private static bool IsPunctuation(string symbol)
    {
        // guard: symbol[0] below would throw on a null or empty string
        if (string.IsNullOrEmpty(symbol))
        {
            return false;
        }

        // symbols reported as UTF-32 are never treated as punctuation
        if (Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0))
        {
            return false;
        }

        return char.IsPunctuation(symbol[0]);
    }
    
    /// <summary>
    /// Determines whether the specified symbol is white space.
    /// </summary>
    /// <param name="symbol">The symbol to test.</param>
    /// <returns>
    /// <c>true</c> if the first character of the symbol is white space according to
    /// <c>char.IsWhiteSpace</c>; <c>false</c> if the symbol is reported as a UTF-32 symbol
    /// by <c>UnicodeCharacterCollection.IsUtf32Symbol</c>, or if it is null or empty.
    /// </returns>
    private static bool IsWhiteSpace(string symbol)
    {
        // guard: symbol[0] below would throw on a null or empty string
        if (string.IsNullOrEmpty(symbol))
        {
            return false;
        }

        // symbols reported as UTF-32 are never treated as white space
        if (Vintasoft.Imaging.Text.UnicodeCharacterCollection.IsUtf32Symbol(symbol, 0))
        {
            return false;
        }

        return char.IsWhiteSpace(symbol[0]);
    }
    
    /// <summary>
    /// Extracts the words of a PDF page, using the IsWordCharacter, IsPunctuation and
    /// IsWhiteSpace predicates to split the page text into words.
    /// </summary>
    /// <param name="page">The PDF page whose text region is searched for words.</param>
    /// <returns>The text of every extracted word, each followed by a line terminator.</returns>
    public static string GetLetterAndDigitWordsFromPdfPage(Vintasoft.Imaging.Pdf.Tree.PdfPage page)
    {
        // split the text region of the page into words with the custom predicates
        Vintasoft.Imaging.Text.TextRegion[] pageWords =
            page.TextRegion.GetWords(IsWordCharacter, IsPunctuation, IsWhiteSpace);

        // write the text of each word on its own line
        System.Text.StringBuilder output = new System.Text.StringBuilder();
        for (int index = 0; index < pageWords.Length; index++)
        {
            output.AppendLine(pageWords[index].TextContent);
        }

        return output.ToString();
    }
    
    

    Requirements

    Target Platforms: .NET 9; .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5

    See Also