首页 > 解决方案 > 如何处理 c 中的换行符和分隔符间距?

问题描述

在此处输入图像描述

实际的文本文件只是用于测试 lex 和解析的随机内容。上面的图片是结果,控制台在运行时给我。在绿色中,它应该是换行符或分隔符时调用标识符,因此不需要任何内容​​。在红色中它没有识别分隔符,在黄色中它根本没有读取 something.something。我假设它与前面的 c 有关系;没有正确分开。

所以我的问题是如何正确分离标记,并识别换行符,或者我做错了什么。下面是我用来进行分离和标记化的代码。

#define _CRT_SECURE_NO_WARNINGS
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define BUFFER_SIZE    1024

// Returns 'true' if the character is a DELIMITER.
bool isDelimiter(char ch)
{
    if (isspace (ch))
        return (true);
    return (false);
}

// Returns 'true' if the character is a SEPERATOR.
bool isSeperator(char str)
{
    if (str == ',' || str == ';' || str == '>' ||
        str == '<' || str == '(' || str == ')' || str == '[' || str == ']' || 
        str == '{' || str == '}' || str == '.' )
        return (true);
    return (false);
}

// Returns 'true' if the character is an OPERATOR.
bool isOperator(char ch)
{
    if (ch == '+' || ch == '-' || ch == '*' ||
        ch == '/' || ch == '>' || ch == '<' ||
        ch == '=')
        return (true);
    return (false);
}

// Returns 'true' if the string is a VALID IDENTIFIER.
bool validIdentifier(char* str)
{
    if (str[0] == '0' || str[0] == '1' || str[0] == '2' ||
        str[0] == '3' || str[0] == '4' || str[0] == '5' ||
        str[0] == '6' || str[0] == '7' || str[0] == '8' ||
        str[0] == '9' )
        return (false);
    return (true);
}

// Returns 'true' if the string is a KEYWORD.
bool isKeyword(char* str)
{
    if (!strcmp(str, "if") || !strcmp(str, "else") ||
        !strcmp(str, "while") || !strcmp(str, "do") ||
        !strcmp(str, "break") || !strcmp(str, "elem") ||
        !strcmp(str, "lout") || !strcmp(str, "file") ||
        !strcmp(str, "console") || !strcmp(str, "read") ||
        !strcmp(str, "write") || !strcmp(str, "mark") ||
        !strcmp(str, "emblemnize") || !strcmp(str, "lin") ||
        !strcmp(str, "send") || !strcmp(str, "dint") ||
        !strcmp(str, "continue") || !strcmp(str, "int")
        || !strcmp(str, "double") || !strcmp(str, "float")
        || !strcmp(str, "return") || !strcmp(str, "char")
        || !strcmp(str, "case") || !strcmp(str, "char")
        || !strcmp(str, "sizeof") || !strcmp(str, "long")
        || !strcmp(str, "short") || !strcmp(str, "typedef")
        || !strcmp(str, "switch") || !strcmp(str, "unsigned")
        || !strcmp(str, "void") || !strcmp(str, "static")
        || !strcmp(str, "struct") || !strcmp(str, "goto"))
        return (true);
    return (false);
}

// Returns 'true' if the string is an INTEGER.
bool isInteger(char* str)
{
    int i, len = strlen(str);

    if (len == 0)
        return (false);
    for (i = 0; i < len; i++) {
        if (str[i] != '0' && str[i] != '1' && str[i] != '2'
            && str[i] != '3' && str[i] != '4' && str[i] != '5'
            && str[i] != '6' && str[i] != '7' && str[i] != '8'
            && str[i] != '9' || (str[i] == '-' && i > 0))
            return (false);
    }
    return (true);
}

// Returns 'true' if the string is a REAL NUMBER.
bool isRealNumber(char* str)
{
    int i, len = strlen(str);
    bool hasDecimal = false;

    if (len == 0)
        return (false);
    for (i = 0; i < len; i++) {
        if (str[i] != '0' && str[i] != '1' && str[i] != '2'
            && str[i] != '3' && str[i] != '4' && str[i] != '5'
            && str[i] != '6' && str[i] != '7' && str[i] != '8'
            && str[i] != '9' && str[i] != '.' ||
            (str[i] == '-' && i > 0))
            return (false);
        if (str[i] == '.')
            hasDecimal = true;
    }
    return (hasDecimal);
}

// Extracts the SUBSTRING.
char* subString(char* str, int left, int right)
{
    int i;
    char* subStr = (char*)malloc(sizeof(char) * (right - left + 2));

    for (i = left; i <= right; i++)
        subStr[i - left] = str[i];
    subStr[right - left + 1] = '\0';
    return (subStr);
}

// Parsing the input STRING.
void parse(char* str)
{
    int left = 0, right = 0;
    int len = strlen(str);

    while (right <= len && left <= right)
    {
        if (isDelimiter(str[right]) == false)
            right++;


        if (isDelimiter(str[right]) == true && left == right)
        {
            if (isOperator(str[right]) == true)
                printf("'%c' IS A OPERATOR\n", str[right]);

            right++;
            left = right;
        }

        if (isDelimiter(str[right]) == true && left == right)
        {
            if (isDelimiter(str[right]) == true)
                printf("'%c' IS A DELIMITER\n", str[right]);

            right++;
            left = right;
        }

        if (isSeperator(str[right]) == true && left == right)
        {
            //needed to recognize seperator to the right
            if (isSeperator(str[right]) == true)
                printf("'%c' IS A SEPERATOR\n", str[right]);

            right++;
            left = right;

            //needed to recognize seperator to the left
            if (isSeperator(str[right]) == true)
                printf("'%c' IS A SEPERATOR\n", str[left]);

            right++;
            left = right;
        }
        else if (isDelimiter(str[right]) == true && left != right
            || (right == len && left != right)) {
            char* subStr = subString(str, left, right - 1);

            if (isKeyword(subStr) == true)
                printf("'%s' IS A KEYWORD\n", subStr);

            else if (isInteger(subStr) == true)
                printf("'%s' IS AN INTEGER\n", subStr);

            else if (isRealNumber(subStr) == true)
                printf("'%s' IS A REAL NUMBER\n", subStr);

            else if (validIdentifier(subStr) == true
                && isDelimiter(str[right - 1]) == false
                && isSeperator(str[right - 1]) == false)
                printf("'%s' IS A VALID IDENTIFIER\n", subStr);

            left = right;
        }
    }
    return;
}

int main(int argc, char *argv)
{

    /* declare a file pointer */
    FILE    *file;
    char    *buffer;
    long    numbytes;

    /* open an existing file for reading */
    file = fopen("Text.txt", "r");

    /* quit if the file does not exist */
    if (file == NULL)
        return 1;

    /* Get the number of bytes */
    fseek(file, 0L, SEEK_END);
    numbytes = ftell(file);

    /* reset the file position indicator to
    the beginning of the file */
    fseek(file, 0L, SEEK_SET);

    /* grab sufficient memory for the
    buffer to hold the text */
    buffer = (char*)calloc(numbytes, sizeof(char));

    /* memory error */
    if (buffer == NULL)
        return 1;

    /* copy all the text into the buffer */
    fread(buffer, sizeof(char), numbytes, file);

    /* confirm we have read the file by
    outputing it to the console */
    printf("  The file called Text.txt contains this text  \n     \n %s             \n\n", buffer);

    parse(buffer); // calling the parse function
    fclose(file);

    /* free the memory we used for the buffer */
    free(buffer);



    return 0;
}

标签: ctokenlexer

解决方案


Looks like the problem is your isDelimiter function isn't picking up all possible values. If you change it to use isspace() it will match on all forms of whitespace.

bool isDelimiter(char ch)
{
    if (isspace(ch))
        return (true);
    return (false);
}

As an example, here is a very simple state machine to give you an idea of what sort of thing I meant. It can be in either of one of two states - INSIDE_IDENTIFIER or OUTSIDE_IDENTIFIER - and it switches between those two states depending on what kind of character it is looking at.

#define OUTSIDE_IDENTIFIER (0)
#define INSIDE_IDENTIFIER (1)

void parse(char *str)
    {
    char *ch;
    int state=OUTSIDE_IDENTIFIER;
    char buffer[1000];
    char *pos=buffer;

    for(ch=str;*ch!='\0';ch++)
        {
        switch(state)
            {
        case INSIDE_IDENTIFIER:
            if(isOperator(*ch))
                {
                *pos='\0';
                printf("Identifier[%s]\n",buffer);
                printf("Operator[%c]\n",*ch);
                state=OUTSIDE_IDENTIFIER;
                }
            else if(isDelimiter(*ch))
                {
                *pos='\0';
                printf("Identifier[%s]\n",buffer);
                printf("Delimiter[%c]\n",*ch);
                state=OUTSIDE_IDENTIFIER;
                }
            else if(isspace(*ch))
                {
                *pos='\0';
                printf("Identifier[%s]\n",buffer);
                printf("Space[%d]\n",*ch);
                state=OUTSIDE_IDENTIFIER;
                }
            else
                {
                *pos=*ch;
                pos++;
                }
        break; 
        case OUTSIDE_IDENTIFIER:
        default:
            if(isOperator(*ch))
                {
                printf("Operator[%c]\n",*ch);
                }
            else if(isDelimiter(*ch))
                {
                printf("Delimiter[%c]\n",*ch);
                }
            else if(isSeperator(*ch))
                {
                printf("Seperator[%c]\n",*ch);
                }
            else
                {
                state = INSIDE_IDENTIFIER;
                pos=buffer;
                *pos=*ch;
                pos++;
                }
        break;
            }
        }
    }

推荐阅读