From e4ee21434d44f77642cafef28b8d9e4bb95cbd90 Mon Sep 17 00:00:00 2001 From: Binx Date: Sun, 14 Jun 2026 15:00:40 +0800 Subject: [PATCH] fix: parse pdf file null char not allowed. --- .../handle/impl/text/pdf_split_handle.py | 5 +++-- apps/knowledge/serializers/paragraph.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/apps/common/handle/impl/text/pdf_split_handle.py b/apps/common/handle/impl/text/pdf_split_handle.py index 725830403b9..dde048887a1 100644 --- a/apps/common/handle/impl/text/pdf_split_handle.py +++ b/apps/common/handle/impl/text/pdf_split_handle.py @@ -228,7 +228,7 @@ def collect_toc(doc, outline, level, toc): title = item.get("/Title") if title is None: title = str(item) - toc.append((level, str(title), page_number)) + toc.append((level, str(title).replace("\0", ""), page_number)) @staticmethod def handle_toc(doc, limit): @@ -522,7 +522,7 @@ def visitor_text(text, cm, tm, font_dict, font_size): except BaseException: return "" - return "".join(text_parts).strip().split("\n")[0].replace(".", "").strip() + return "".join(text_parts).replace("\0", "").strip().split("\n")[0].replace(".", "").strip() @staticmethod def extract_first_line(page): @@ -531,6 +531,7 @@ def extract_first_line(page): @staticmethod def handle_chapter_title(title): + title = title.replace("\0", "") title = re.sub(r"[一二三四五六七八九十\s*]、\s*", "", title) title = re.sub(r"第[一二三四五六七八九十]章\s*", "", title) return title diff --git a/apps/knowledge/serializers/paragraph.py b/apps/knowledge/serializers/paragraph.py index 5525c2dac14..09a30242c77 100644 --- a/apps/knowledge/serializers/paragraph.py +++ b/apps/knowledge/serializers/paragraph.py @@ -48,6 +48,13 @@ from knowledge.task.generate import generate_related_by_paragraph_id_list +class NullCharacterStrippedCharField(serializers.CharField): + def to_internal_value(self, data): + if isinstance(data, str): + data = data.replace("\x00", "") + return super().to_internal_value(data) + + class ParagraphSerializer(serializers.ModelSerializer): class Meta: model = Paragraph @@ -59,10 +66,10 @@ class ParagraphInstanceSerializer(serializers.Serializer): 段落实例对象 """ - content = serializers.CharField( + content = NullCharacterStrippedCharField( required=True, label=_("content"), max_length=102400, min_length=1, allow_null=True, allow_blank=True ) - title = serializers.CharField( + title = NullCharacterStrippedCharField( required=False, max_length=256, label=_("section title"), allow_null=True, allow_blank=True ) problem_list = ProblemInstanceSerializer(required=False, many=True) @@ -70,10 +77,10 @@ class ParagraphInstanceSerializer(serializers.Serializer): class EditParagraphSerializers(serializers.Serializer): - title = serializers.CharField( + title = NullCharacterStrippedCharField( required=False, max_length=256, label=_("section title"), allow_null=True, allow_blank=True ) - content = serializers.CharField( + content = NullCharacterStrippedCharField( required=False, max_length=102400, allow_null=True, allow_blank=True, label=_("section title") ) problem_list = ProblemInstanceSerializer(required=False, many=True) @@ -91,10 +98,10 @@ class ParagraphBatchGenerateRelatedSerializer(serializers.Serializer): class ParagraphSerializers(serializers.Serializer): - title = serializers.CharField( + title = NullCharacterStrippedCharField( required=False, max_length=256, label=_("section title"), allow_null=True, allow_blank=True ) - content = serializers.CharField(required=True, max_length=102400, label=_("section title")) + content = NullCharacterStrippedCharField(required=True, max_length=102400, label=_("section title")) class Problem(serializers.Serializer): workspace_id = serializers.CharField(required=True, label=_("workspace id"))