@@ -759,6 +759,66 @@ xfarray_qsort_push(
759759 return 0 ;
760760}
761761
762+ /*
763+ * Load an element from the array into the first scratchpad and cache the page,
764+ * if possible.
765+ */
766+ static inline int
767+ xfarray_sort_load_cached (
768+ struct xfarray_sortinfo * si ,
769+ xfarray_idx_t idx ,
770+ void * ptr )
771+ {
772+ loff_t idx_pos = xfarray_pos (si -> array , idx );
773+ pgoff_t startpage ;
774+ pgoff_t endpage ;
775+ int error = 0 ;
776+
777+ /*
778+ * If this load would split a page, release the cached page, if any,
779+ * and perform a traditional read.
780+ */
781+ startpage = idx_pos >> PAGE_SHIFT ;
782+ endpage = (idx_pos + si -> array -> obj_size - 1 ) >> PAGE_SHIFT ;
783+ if (startpage != endpage ) {
784+ error = xfarray_sort_put_page (si );
785+ if (error )
786+ return error ;
787+
788+ if (xfarray_sort_terminated (si , & error ))
789+ return error ;
790+
791+ return xfile_obj_load (si -> array -> xfile , ptr ,
792+ si -> array -> obj_size , idx_pos );
793+ }
794+
795+ /* If the cached page is not the one we want, release it. */
796+ if (xfile_page_cached (& si -> xfpage ) &&
797+ xfile_page_index (& si -> xfpage ) != startpage ) {
798+ error = xfarray_sort_put_page (si );
799+ if (error )
800+ return error ;
801+ }
802+
803+ /*
804+ * If we don't have a cached page (and we know the load is contained
805+ * in a single page) then grab it.
806+ */
807+ if (!xfile_page_cached (& si -> xfpage )) {
808+ if (xfarray_sort_terminated (si , & error ))
809+ return error ;
810+
811+ error = xfarray_sort_get_page (si , startpage << PAGE_SHIFT ,
812+ PAGE_SIZE );
813+ if (error )
814+ return error ;
815+ }
816+
817+ memcpy (ptr , si -> page_kaddr + offset_in_page (idx_pos ),
818+ si -> array -> obj_size );
819+ return 0 ;
820+ }
821+
762822/*
763823 * Sort the array elements via quicksort. This implementation incorporates
764824 * four optimizations discussed in Sedgewick:
@@ -784,6 +844,10 @@ xfarray_qsort_push(
784844 * If a small set is contained entirely within a single xfile memory page,
785845 * map the page directly and run heap sort directly on the xfile page
786846 * instead of using the load/store interface. This halves the runtime.
847+ *
848+ * 5. This optimization is specific to the implementation. When converging lo
849+ * and hi after selecting a pivot, we will try to retain the xfile memory
850+ * page between load calls, which reduces run time by 50%.
787851 */
788852
789853/*
@@ -865,19 +929,20 @@ xfarray_sort(
865929 * Decrement hi until it finds an a[hi] less than the
866930 * pivot value.
867931 */
868- error = xfarray_sort_load (si , hi , scratch );
932+ error = xfarray_sort_load_cached (si , hi , scratch );
869933 if (error )
870934 goto out_free ;
871935 while (xfarray_sort_cmp (si , scratch , pivot ) >= 0 &&
872936 lo < hi ) {
873- if (xfarray_sort_terminated (si , & error ))
874- goto out_free ;
875-
876937 hi -- ;
877- error = xfarray_sort_load (si , hi , scratch );
938+ error = xfarray_sort_load_cached (si , hi ,
939+ scratch );
878940 if (error )
879941 goto out_free ;
880942 }
943+ error = xfarray_sort_put_page (si );
944+ if (error )
945+ goto out_free ;
881946
882947 if (xfarray_sort_terminated (si , & error ))
883948 goto out_free ;
@@ -893,19 +958,20 @@ xfarray_sort(
893958 * Increment lo until it finds an a[lo] greater than
894959 * the pivot value.
895960 */
896- error = xfarray_sort_load (si , lo , scratch );
961+ error = xfarray_sort_load_cached (si , lo , scratch );
897962 if (error )
898963 goto out_free ;
899964 while (xfarray_sort_cmp (si , scratch , pivot ) <= 0 &&
900965 lo < hi ) {
901- if (xfarray_sort_terminated (si , & error ))
902- goto out_free ;
903-
904966 lo ++ ;
905- error = xfarray_sort_load (si , lo , scratch );
967+ error = xfarray_sort_load_cached (si , lo ,
968+ scratch );
906969 if (error )
907970 goto out_free ;
908971 }
972+ error = xfarray_sort_put_page (si );
973+ if (error )
974+ goto out_free ;
909975
910976 if (xfarray_sort_terminated (si , & error ))
911977 goto out_free ;
0 commit comments